tianyu.chu 2 weeks ago
parent
commit
8d2ade5330
100 changed files with 9399 additions and 1 deletion
  1. 7 1
      .gitignore
  2. 34 0
      CLAUDE.md
  3. 63 0
      README.md
  4. 63 0
      bin/common/functions.sh
  5. 70 0
      bin/common/init.sh
  6. 25 0
      bin/common/print-constants.sh
  7. 798 0
      bin/datax-gc-generator.py
  8. 142 0
      bin/datax-job-config-generator.py
  9. 13 0
      bin/datax-multiple-hive-job-starter.py
  10. 262 0
      bin/datax-multiple-hive-job-starter.sh
  11. 13 0
      bin/datax-multiple-job-starter.py
  12. 264 0
      bin/datax-multiple-job-starter.sh
  13. 13 0
      bin/datax-single-job-starter.py
  14. 277 0
      bin/datax-single-job-starter.sh
  15. 83 0
      bin/dingtalk-work-alert.sh
  16. 122 0
      bin/excel_to_hive.py
  17. 194 0
      bin/flume-control.sh
  18. 13 0
      bin/hive-exec-job-starter.py
  19. 186 0
      bin/hive-exec.sh
  20. 191 0
      bin/spark-sql-starter.py
  21. 0 0
      conf/.gitkeep
  22. 123 0
      dw_base/__init__.py
  23. 0 0
      dw_base/common/__init__.py
  24. 14 0
      dw_base/common/alerter_constants.py
  25. 8 0
      dw_base/common/config_constants.py
  26. 14 0
      dw_base/common/container.py
  27. 20 0
      dw_base/common/template_constants.py
  28. 2 0
      dw_base/database/__init__.py
  29. 184 0
      dw_base/database/mongodb_utils.py
  30. 185 0
      dw_base/database/mysql_utils.py
  31. 0 0
      dw_base/datax/__init__.py
  32. 0 0
      dw_base/datax/datasources/__init__.py
  33. 16 0
      dw_base/datax/datasources/clickhouse_data_source.py
  34. 27 0
      dw_base/datax/datasources/data_source.py
  35. 35 0
      dw_base/datax/datasources/data_source_factory.py
  36. 21 0
      dw_base/datax/datasources/elasticseach_data_source.py
  37. 30 0
      dw_base/datax/datasources/hbase_data_source.py
  38. 23 0
      dw_base/datax/datasources/hdfs_data_source.py
  39. 23 0
      dw_base/datax/datasources/kafka_data_source.py
  40. 30 0
      dw_base/datax/datasources/mongo_data_source.py
  41. 26 0
      dw_base/datax/datasources/mysql_data_source.py
  42. 16 0
      dw_base/datax/datasources/postgresql_data_source.py
  43. 43 0
      dw_base/datax/datax_constants.py
  44. 40 0
      dw_base/datax/datax_utils.py
  45. 89 0
      dw_base/datax/job_config_generator.py
  46. 0 0
      dw_base/datax/plugins/__init__.py
  47. 168 0
      dw_base/datax/plugins/plugin.py
  48. 70 0
      dw_base/datax/plugins/plugin_factory.py
  49. 2 0
      dw_base/datax/plugins/reader/__init__.py
  50. 74 0
      dw_base/datax/plugins/reader/clickhouse_reader.py
  51. 78 0
      dw_base/datax/plugins/reader/hdfs_reader.py
  52. 42 0
      dw_base/datax/plugins/reader/mongo_reader.py
  53. 249 0
      dw_base/datax/plugins/reader/mysql_reader.py
  54. 76 0
      dw_base/datax/plugins/reader/postgresql_reader.py
  55. 14 0
      dw_base/datax/plugins/reader/reader.py
  56. 2 0
      dw_base/datax/plugins/writer/__init__.py
  57. 57 0
      dw_base/datax/plugins/writer/clickhouse_writer.py
  58. 28 0
      dw_base/datax/plugins/writer/elasticsearch_writer.py
  59. 155 0
      dw_base/datax/plugins/writer/hbase_writer.py
  60. 95 0
      dw_base/datax/plugins/writer/hdfs_writer.py
  61. 85 0
      dw_base/datax/plugins/writer/kafka_writer.py
  62. 72 0
      dw_base/datax/plugins/writer/mongo_writer.py
  63. 60 0
      dw_base/datax/plugins/writer/mysql_writer.py
  64. 57 0
      dw_base/datax/plugins/writer/postgresql_writer.py
  65. 14 0
      dw_base/datax/plugins/writer/writer.py
  66. 3 0
      dw_base/ds/__init__.py
  67. 19 0
      dw_base/ds/config/base_config.yaml
  68. 9 0
      dw_base/ds/config/process_code.yaml
  69. 76 0
      dw_base/ds/ds_start_workflow.py
  70. 3 0
      dw_base/elasticsearch/__init__.py
  71. 3 0
      dw_base/flink/__init__.py
  72. 3 0
      dw_base/hive/__init__.py
  73. 30 0
      dw_base/hive/hive_constants.py
  74. 141 0
      dw_base/hive/hive_utils.py
  75. 3 0
      dw_base/ml/__init__.py
  76. 3 0
      dw_base/oss/__init__.py
  77. 235 0
      dw_base/oss/oss2_util.py
  78. 0 0
      dw_base/scheduler/__init__.py
  79. 186 0
      dw_base/scheduler/country_count_dingtalk.py
  80. 240 0
      dw_base/scheduler/dingtalk_mirror_monitor.py
  81. 102 0
      dw_base/scheduler/dingtalk_notifier.py
  82. 370 0
      dw_base/scheduler/dingtalk_task_monitor.py
  83. 368 0
      dw_base/scheduler/dingtalk_task_monitor_new.py
  84. 45 0
      dw_base/scheduler/drop_daily_full_snapshot_tbls.py
  85. 45 0
      dw_base/scheduler/drop_partitions.py
  86. 498 0
      dw_base/scheduler/ent_interface_dingtalk.py
  87. 132 0
      dw_base/scheduler/ent_interface_dingtalk_call.py
  88. 141 0
      dw_base/scheduler/ent_interface_dingtalk_top10.py
  89. 242 0
      dw_base/scheduler/ent_interface_dingtalk_update.py
  90. 185 0
      dw_base/scheduler/get_oldmongo_cjfs.py
  91. 185 0
      dw_base/scheduler/get_oldmongo_sldw.py
  92. 90 0
      dw_base/scheduler/get_oldmongo_sldw_detail.py
  93. 102 0
      dw_base/scheduler/get_oldmongo_stat.py
  94. 139 0
      dw_base/scheduler/get_oldmongo_ysfs.py
  95. 0 0
      dw_base/scheduler/mg2es/__init__.py
  96. 53 0
      dw_base/scheduler/mg2es/conf_reader.py
  97. 37 0
      dw_base/scheduler/mg2es/dict_redis2hive.py
  98. 47 0
      dw_base/scheduler/mg2es/es_index_backup.py
  99. 214 0
      dw_base/scheduler/mg2es/es_operator.py
  100. 250 0
      dw_base/scheduler/mg2es/es_tmpl_gen.py

+ 7 - 1
.gitignore

@@ -2,6 +2,7 @@
 .git
 .idea
 bin/loop.sh
+conf/bak
 conf/datax/generated
 ignored
 lib
@@ -13,4 +14,9 @@ data
 target
 *.nosync
 *.icloud
-*.zip
+*.zip
+.codex
+.claude
+launch-pad
+requirements.txt.bak
+CLAUDE.md

+ 34 - 0
CLAUDE.md

@@ -0,0 +1,34 @@
+# CLAUDE.md
+
+This project is the data warehouse codebase `poyee-data-warehouse`, currently in an **in-place, incremental refactor away from the legacy project `tendata-warehouse-release`**.
+
+## Required reading before starting any task
+
+1. **Read `kb/README.md` first**: it is the authoritative knowledge-base entry point, covering project status, document groups, and reading paths
+2. **Key documents for a cold start**:
+   - `kb/README.md`: quick read on project status (old and new projects coexist, the role of launch-pad, current progress)
+   - `kb/00-项目架构.md`: target directory structure, module responsibilities, execution sequence
+   - `kb/90-重构路线.md`: P0-P3 refactoring checklist and hard-coded-value inventory
+   - `kb/92-重构进度.md`: refactoring task checklist and current status
+3. **Read before writing table DDL SQL / DataX ini**: `kb/21-命名规范.md` (five-segment naming, business-domain codes, field word-root dictionary)
+4. **Read before writing transformation SQL**: `kb/20-数仓分层与建模.md` plus the sample job structure in `kb/00-项目架构.md` §9
+
+## Key project facts
+
+- **Project root = the directory containing this file**. `kb/` is documentation; most other entries (`tendata/`, `launch-pad/`, `bin/`, `publish.sh`, etc.) are **legacy-project code**
+- **Refactoring mode**: rework in place, not a freshly created empty tree
+- **No business migration from `launch-pad/`**: it holds historical code from the **previous project** (unrelated to the current business) and serves only as a reference for sample SQL / DataX ini style; all new business SQL is written from scratch, and `launch-pad/` is deleted wholesale once that is done
+- **`tendata/` → `dw_base/`**: the core module is renamed, which means renaming the package plus a global replace of every `from tendata ...` import, every `ADD FILE tendata/...` statement in SQL, and the `tendata.zip` packaging command, as sketched below
+- **DDL and transformation SQL are physically separated, with a single source for DDL**: `manual/ddl/` is the **only source of all DDL** (initial CREATE plus later ALTERs), following a migration model: every DDL operation is an immutable file and **old files are never edited afterwards**. `jobs/` contains only `INSERT OVERWRITE`, never CREATE TABLE. **There is no second ddl directory** (do not create a top-level `ddl/`). See `kb/00-项目架构.md` §9.1 / §9.6
+- **Storage format**: every layer uses `STORED AS ORC` with `orc.compress=NONE` (uncompressed)
+- **raw-layer contract**: all columns are STRING, sync jobs do no type conversion, and typing is left to the ods layer (see `kb/20-数仓分层与建模.md` §8). One-off history imports use the `his` snapshot type with the `o` cycle, table names of the form `raw_xxx_his_o`, no partitions, created in one step via CTAS (see `kb/00-项目架构.md` §9.3 / §9.3.1)
+- **Deployment**: repository name = deployment directory name = `poyee-data-warehouse`, deployed to `/home/bigdata/release/poyee-data-warehouse/` as user `bigdata`
+- **Sensitive data-source configuration**: never committed; maintained by ops at `/home/bigdata/release/datasource/{db_type}/{instance}.ini`
+
+## Collaboration conventions
+
+- **Do not create new directories on your own initiative**
+- **Do not migrate content from launch-pad/**: its business is entirely unrelated to the new project
+- **All table, column, and file names** must follow `kb/21-命名规范.md`, without exception
+- **Before any refactoring step**, check `kb/92-重构进度.md` to avoid redoing finished work or jumping ahead of unfinished prerequisites
+- **When editing kb/ documents**, update the index in `kb/README.md` and the status in `kb/92-重构进度.md` in the same change
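The `tendata/` → `dw_base/` rename described above touches imports, SQL `ADD FILE` statements, and the packaging command. Below is a minimal, hypothetical sketch of that global replace; it is not part of this commit, and the file suffixes and patterns are assumptions to review before running.

```python
import pathlib
import re

# Rewrite rules for the tendata -> dw_base rename (illustrative only).
REWRITES = [
    (re.compile(r"\bfrom tendata\b"), "from dw_base"),
    (re.compile(r"\bimport tendata\b"), "import dw_base"),
    (re.compile(r"ADD FILE tendata/"), "ADD FILE dw_base/"),
    (re.compile(r"tendata\.zip"), "dw_base.zip"),
]

def rewrite_tree(root, suffixes=(".py", ".sql", ".sh")):
    """Apply the tendata -> dw_base rewrites to every matching file under root."""
    for path in pathlib.Path(root).rglob("*"):
        if not path.is_file() or path.suffix not in suffixes:
            continue
        original = path.read_text(encoding="utf-8")
        text = original
        for pattern, replacement in REWRITES:
            text = pattern.sub(replacement, text)
        if text != original:
            path.write_text(text, encoding="utf-8")
            print("rewrote", path)

# rewrite_tree(".")  # run from the project root after reviewing the patterns
```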

+ 63 - 0
README.md

@@ -0,0 +1,63 @@
+# dw-project
+
+A data warehouse project built on PySpark + DataX, responsible for multi-source data collection, cleansing, layered processing, and distribution.
+
+## Tech stack
+
+- **Compute engine**: PySpark 2.4 on YARN (CDH 6.3.2)
+- **Data integration**: DataX (open-sourced by Alibaba)
+- **Metadata management**: Hive MetaStore
+- **Storage**: HDFS (ORC), RDS PostgreSQL, Elasticsearch
+- **Scheduling**: DolphinScheduler
+- **Alerting**: WeCom (enterprise WeChat) bot
+
+## Directory layout
+
+```
+dw-project/
+├── bin/           # launcher scripts (Shell + Python entry points)
+├── jobs/          # business code, organized by warehouse layer
+│   ├── raw/       #   raw data ingestion (DataX ini)
+│   ├── ods/       #   operational data store layer (SQL)
+│   ├── dwd/       #   detail layer (SQL)
+│   ├── dws/       #   summary layer (SQL)
+│   ├── tdm/       #   topic-domain model layer (SQL)
+│   └── ads/       #   application layer (SQL + export ini)
+├── dw_base/       # shared library (Spark engine, DataX engine, utilities, UDFs)
+├── kb/            # knowledge base (project documentation)
+├── conf/          # configuration (non-sensitive items: samples + environment configs)
+├── publish.sh     # cluster deployment script
+└── requirements.txt
+```
+
+Data-source connection configs (including credentials) live in the sibling directory `datasource/`, are maintained by ops, and are not under version control.
+
+## Main execution entry points
+
+| Script | Purpose | Example |
+|------|------|------|
+| `bin/spark-sql-starter.py` | Run Spark SQL (see the sketch below) | `-f jobs/customs/001india/02incr/01india_im/02dwd.sql -dt 20250101` |
+| `bin/datax-multiple-hive-job-starter.sh` | MySQL→Hive batch ingestion (primary) | `-gcd conf/datax/config/mysql-hdfs/prod -start-date 20250101 -parallel` |
+| `bin/datax-multiple-job-starter.sh` | General-purpose batch DataX sync | `-gcd jobs/customs/001india/02incr/01india_im/ -start-date 20250101` |
+| `bin/datax-single-job-starter.sh` | Single DataX sync | `-gc jobs/xxx/from_mongo.ini -start-date 20250101` |
+
+
+## Warehouse layers
+
+```
+PG/ES ──DataX(raw)──> RAW ──> ODS ──> DWD ──> DWS ──> TDM ──> ADS
+                                                               │
+                                                      DataX / BrokerLoad
+                                                               │
+                                                    Doris / ES / MongoDB
+```
+
+## Development environment
+
+- Develop and debug with PyCharm over a remote SSH connection to the server
+- Python 3.6.8; dependencies are listed in `requirements.txt`
+- Deployment: `publish.sh` distributes to every cluster node via git pull + rsync
+
+## Documentation
+
+Detailed documentation lives in the `kb/` directory
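For orientation, the entry-point table in the README above can also be driven from Python. The following is a minimal, hypothetical wrapper around `bin/spark-sql-starter.py`, not code from this commit; the `-f`/`-dt` flags and the example path come from the table, everything else is illustrative.

```python
import subprocess
import sys

def run_spark_sql(sql_file, dt):
    """Run one layered SQL job for a single partition date (yyyyMMdd)."""
    cmd = [
        sys.executable, "bin/spark-sql-starter.py",
        "-f", sql_file,  # SQL file under jobs/
        "-dt", dt,       # partition date, yyyyMMdd
    ]
    print("running:", " ".join(cmd))
    return subprocess.call(cmd)

if __name__ == "__main__":
    # Example values copied from the README table; replace with a real job.
    sys.exit(run_spark_sql("jobs/customs/001india/02incr/01india_im/02dwd.sql", "20250101"))
```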

+ 63 - 0
bin/common/functions.sh

@@ -0,0 +1,63 @@
+#!/bin/bash
+FONT_COLOR=(31 32 33 34 35 36)
+function cow_says() {
+  font_index=$((RANDOM % ${#FONT_COLOR[@]}))
+  echo -en "\033[0;${FONT_COLOR[${font_index}]};5m"
+  echo -en "${DO_RESET}"
+}
+
+
+function pretty_print() {
+  echo -e "${NORM_CYN}$(date '+%Y-%m-%d %H:%M:%S') ${DO_RESET}${1}${DO_RESET}"
+}
+
+
+function date_range() {
+  dt=${1}
+  if [ "${#dt}" = 17 ] && [[ "${dt}" =~ [0-9]{8}-[0-9]{8} ]]; then
+    start_day=${dt:0:8}
+    end_day=${dt:9:8}
+  elif [ "${#dt}" = 9 ] && [[ "${dt}" =~ [0-9]{8}- ]]; then
+    start_day=${dt:0:8}
+    if [ "$(uname)" = "Linux" ]; then
+      end_day=$(date -d '-1 day' +%Y%m%d)
+    else
+      end_day=$(date -v-1d +%Y%m%d)
+    fi
+  elif [ "${#dt}" -ge 17 ] && [[ "${dt}" =~ ^[0-9]{8}(,[0-9]{8})+$ ]]; then
+    DATE_RANGE=(${dt//,/ })
+    return
+  else
+    DATE_RANGE=("${dt}")
+    return
+  fi
+  DATE_RANGE=("${start_day}")
+  while [ "${start_day}" -lt "${end_day}" ]; do
+    if [ "$(uname)" = 'Linux' ]; then
+      start_day=$(date -d "1 day ${start_day}" +%Y%m%d)
+    else
+      start_day_ts=$(date -j -f %Y%m%d "${start_day}" +%s)
+      start_day_ts=$(("${start_day_ts}" + 86400))
+      start_day=$(date -r ${start_day_ts} +%Y%m%d)
+    fi
+    DATE_RANGE+=("${start_day}")
+  done
+}
+
+function waiting() {
+  local dots
+  while true; do
+    for dots in . .. ... .... ..... ......; do
+      echo -en "\r\033[J请稍候${dots}"
+      sleep 0.5
+    done
+  done
+}
+
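The `date_range` function above accepts a single date, an explicit `yyyyMMdd-yyyyMMdd` range, an open-ended `yyyyMMdd-` range (through yesterday), or a comma-separated list, and expands it into `DATE_RANGE`. The sketch below restates those rules in Python purely for illustration; it is not part of the repository.

```python
import re
from datetime import date, datetime, timedelta

def date_range(dt):
    """Expand a dt argument the same way bin/common/functions.sh does."""
    if re.fullmatch(r"\d{8}-\d{8}", dt):          # explicit start-end range
        start, end = dt.split("-")
    elif re.fullmatch(r"\d{8}-", dt):             # open range: start .. yesterday
        start = dt[:8]
        end = (date.today() - timedelta(days=1)).strftime("%Y%m%d")
    elif re.fullmatch(r"\d{8}(,\d{8})+", dt):     # explicit comma-separated list
        return dt.split(",")
    else:                                         # single value, passed through
        return [dt]
    days = [start]
    cur = datetime.strptime(start, "%Y%m%d")
    while days[-1] < end:                         # walk forward one day at a time
        cur += timedelta(days=1)
        days.append(cur.strftime("%Y%m%d"))
    return days

print(date_range("20250101-20250104"))  # ['20250101', '20250102', '20250103', '20250104']
```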

+ 70 - 0
bin/common/init.sh

@@ -0,0 +1,70 @@
+#!/bin/bash
+if [ -z "${BASE_DIR}" ]; then
+  BASE_DIR=$(
+    cd "$(dirname "$(realpath "$0")")/.." || exit
+    pwd
+  )
+fi
+. "${BASE_DIR}"/bin/common/functions.sh
+BANNED_USER="root"
+RELEASE_USER="alvis"
+USER="$(whoami)"
+CURRENT_HOST=$(hostname -s)
+RELEASE_HOST="m3"
+RELEASE_ROOT_DIR="/home/alvis/release"
+PROJECT_NAME=$(basename "${BASE_DIR}")
+PYTHON3_PATH="/usr/bin/python3"
+DATAX_HOME="/opt/module/datax"
+DATAX_WORKERS=(
+  m3 d1 d2 d3 d4
+)
+declare -A DATAX_WORKERS_WEIGHTS=(
+  ["m3"]=1
+  ["d1"]=2 ["d2"]=2 ["d3"]=3 ["d4"]=3
+)
+DATAX_WORKERS_QUEUE=()
+# Expand the weighted host map into a dispatch queue: each host appears once per unit of weight
+for key in ${!DATAX_WORKERS_WEIGHTS[*]}; do
+  for ((i = 0; i < ${DATAX_WORKERS_WEIGHTS[$key]}; i++)); do
+    DATAX_WORKERS_QUEUE+=("$key")
+  done
+done
+if [ "${USER}" == "${RELEASE_USER}" ]; then
+  LOG_ROOT_DIR="/opt/data/log"
+  IS_RUN_BY_RELEASE_USER="1"
+  pretty_print "${NORM_MGT}Project ${NORM_GRN}${PROJECT_NAME}${NORM_MGT} is running by release user ${NORM_GRN}${RELEASE_USER}"
+elif [ "${USER}" == "${BANNED_USER}" ]; then
+  LOG_ROOT_DIR="/opt/data/log"
+  pretty_print "${NORM_RED}Project ${NORM_GRN}${PROJECT_NAME}${NORM_RED} is running by banned user ${NORM_GRN}${BANNED_USER}${NORM_RED}, exit with error code ${NORM_GRN}18"
+  exit 18
+else
+  LOG_ROOT_DIR="/opt/data/log/users/${USER}"
+  IS_RUN_BY_NORMAL_USER="1"
+  . "${BASE_DIR}"/bin/common/print-constants.sh
+  if [ "${CURRENT_HOST}" == "${RELEASE_HOST}" ] && [ -n "${IS_RUN_BY_NORMAL_USER}" ]; then
+    cow_says
+  fi
+  pretty_print "${NORM_MGT}Project ${NORM_GRN}${PROJECT_NAME}${NORM_MGT} is running by normal user ${NORM_GRN}${USER}"
+fi
+if [[ "${BASE_DIR}" =~ "${RELEASE_ROOT_DIR}/${PROJECT_NAME}/"* ]]; then
+  IS_RUN_IN_RELEASE_DIR="1"
+  pretty_print "${NORM_MGT}Project ${NORM_GRN}${PROJECT_NAME}${NORM_MGT} is running in release dir ${NORM_GRN}${RELEASE_ROOT_DIR}/${PROJECT_NAME}"
+else
+  pretty_print "${NORM_MGT}Project ${NORM_GRN}${PROJECT_NAME}${NORM_MGT} is running in normal user dir ${NORM_GRN}${BASE_DIR}"
+fi
+
+export CURRENT_HOST
+export DATAX_HOME
+export DATAX_WORKERS
+export BA_LITTLE_CUTE
+export DCP_LITTLE_CUTE
+export ETL_LITTLE_CUTE
+export SKB_LITTLE_CUTE
+export REALTIME_LITTLE_CUTE
+export LOG_ROOT_DIR
+export PYTHON3_PATH
+export RELEASE_ROOT_DIR
+export IS_RUN_IN_RELEASE_DIR
+export IS_RUN_BY_RELEASE_USER
+echo -en "${NORM_GRN}"
+echo -en "${DO_RESET}"

+ 25 - 0
bin/common/print-constants.sh

@@ -0,0 +1,25 @@
+#!/bin/bash
+DO_RESET="\033[0m"
+CHG_BOLD="\033[1m"
+TO_BLINK="\033[5m"
+NORM_RED="\033[0;31m"
+NORM_GRN="\033[0;32m"
+NORM_YEL="\033[0;33m"
+NORM_BLU="\033[0;34m"
+NORM_MGT="\033[0;35m"
+NORM_CYN="\033[0;36m"
+NORM_WHT="\033[0;37m"
+BOLD_RED="\033[1;31m"
+BOLD_GRN="\033[1;32m"
+BOLD_YEL="\033[1;33m"
+BOLD_BLU="\033[1;34m"
+BOLD_MGT="\033[1;35m"
+BOLD_CYN="\033[1;36m"
+BOLD_WHT="\033[1;37m"
+BGRD_RED="\033[41m"
+BGRD_GRN="\033[42m"
+BGRD_YEL="\033[43m"
+BGRD_BLU="\033[44m"
+BGRD_MGT="\033[45m"
+BGRD_CYN="\033[46m"
+BGRD_WHT="\033[47m"

+ 798 - 0
bin/datax-gc-generator.py

@@ -0,0 +1,798 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+"""
+  生成DataX作业配置文件生成器的配置文件
+"""
+# -*- coding=utf-8 -*-
+
+import os
+import re
+import sys
+import time
+
+base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(base_dir)
+from dw_base import DO_RESET, NORM_CYN, NORM_GRN, NORM_MGT, NORM_YEL, NORM_RED
+from dw_base.database.mysql_utils import MySQLHandler
+from dw_base.datax.datasources.hdfs_data_source import HDFSDataSource
+from dw_base.datax.datasources.mysql_data_source import MySQLDataSource
+from dw_base.datax.datax_utils import convert_mysql_column_types
+from dw_base.datax.plugins.reader.hdfs_reader import HDFSReader
+from dw_base.datax.plugins.reader.mysql_reader import MySQLReader
+from dw_base.datax.plugins.writer.hbase_writer import HBaseWriter
+from dw_base.datax.plugins.writer.hdfs_writer import HDFSWriter
+from dw_base.datax.plugins.writer.kafka_writer import KafkaWriter
+from dw_base.datax.plugins.writer.mongo_writer import MongoWriter, MONGO_SPECIAL_WORDS_DICT
+from dw_base.hive.hive_utils import get_hive_database_name, get_hive_table_prefix
+from dw_base.spark.spark_sql import SparkSQL
+from dw_base.utils.common_utils import exist
+from dw_base.utils.config_utils import parse_args
+from dw_base.utils.file_utils import write_file, append_file, list_files, load_json_file
+from dw_base.utils.log_utils import pretty_print
+from dw_base.utils.string_utils import snake_case_to_pascal_case, snake_case_to_camel_case
+
+
+def usage(code: int):
+    print(
+        f'{NORM_MGT}Usage: {sys.argv[0]}\n'
+        f'{NORM_CYN}\t[-H/--H/--help]                   打印脚本使用方法{DO_RESET}'
+    )
+    if not from_system or not to_system:
+        print(
+            f'{NORM_MGT}Usage: {sys.argv[0]}\n'
+            f'{NORM_GRN}\t<[-]-from< /=>from>               源系统类型(默认MySQL),目前支持hdfs、mysql\n'
+            f'{NORM_GRN}\t<[-]-to< /=>to>                   目标系统类型(默认HDFS),目前支持elasticsearch、hbase、hdfs、kafka、mongo、mysql\n'
+            f'{NORM_CYN}\t[[-]-output< /=>output directory] 生成的ini文件存储位置,可以是绝对路径或相对路径'
+            f'{DO_RESET}'
+        )
+    if from_system == "hdfs":
+        print(
+            f'{NORM_MGT}Parameters when from is HDFS: \n'
+            f'{NORM_GRN}\t<[-]-d< /=>database>              Hive数据库(默认crl_mg)\n'
+            f'{NORM_GRN}\t<[-]-t< /=>table>                 Hive数据表(可多次传入,不传扫描-D传入的库中所有表)\n'
+            f'{NORM_GRN}\t<[-]-e< /=>exclude>               忽略的Hive数据表(可多次传入)\n'
+            f'{NORM_CYN}\t[[-]-partitioned]                 是否是分区表(默认为是),目前只支持日分区\n'
+            f'{DO_RESET}'
+        )
+    elif from_system == "mysql":
+        print(
+            f'{NORM_MGT}Parameters when from is MySQL: \n'
+            f'{NORM_GRN}\t<[-]-h< /=>host>                  MySQL主机\n'
+            f'{NORM_CYN}\t[[-]-P< /=>port]                  MySQL端口\n'
+            f'{NORM_GRN}\t<[-]-u< /=>username>              MySQL用户\n'
+            f'{NORM_GRN}\t<[-]-p< /=>password>              MySQL密码\n'
+            f'{NORM_GRN}\t<[-]-D< /=>database>              MySQL数据库\n'
+            f'{NORM_GRN}\t<[-]-tr< /=>table>                MySQL数据表正则(可多次传入,优先级高于t)\n'
+            f'{NORM_GRN}\t<[-]-t< /=>table>                 MySQL数据表(可多次传入)\n'
+            f'{NORM_GRN}\t<[-]-er< /=>exclude>              忽略的MySQL数据表正则(可多次传入,优先级高于e)\n'
+            f'{NORM_GRN}\t<[-]-e< /=>exclude>               忽略的MySQL数据表(可多次传入)\n'
+            f'{NORM_CYN}\t[[-]-inc-col]                     增量抽取字段名称(默认update_time)'
+            f'{DO_RESET}'
+        )
+    if to_system == "elasticsearch":
+        print(
+            f'{NORM_MGT}Parameters when to is Elasticsearch: \n'
+            f'{NORM_CYN}\t[[-]-inc-col]                     增量抽取字段名称(默认update_time)'
+            f'{DO_RESET}'
+        )
+    elif to_system == "hbase":
+        print(
+            f'{NORM_MGT}Parameters when to is HBase: \n'
+            f'{NORM_CYN}\t[[-]-inc-col]                     增量抽取字段名称(默认update_time)'
+            f'{DO_RESET}'
+        )
+    elif to_system == "hdfs":
+        print(
+            f'{NORM_MGT}Parameters when to is HDFS: \n'
+            f'{NORM_CYN}\t[[-]-project< /=>project]         项目名称,如skb、bms\n'
+            f'{NORM_CYN}\t[[-]-layer< /=>dw-layer]          数据仓库分层,如ods、dwd\n'
+            f'{NORM_CYN}\t[[-]-env< /=>dw-env]              数据仓库环境,如test\n'
+            f'{NORM_CYN}\t[[-]-edition< /=>dw-edition]      数据仓库版本,如dl,dd,\n'
+            f'{NORM_CYN}\t[[-]-partitioned]                 是否是分区表(默认为是),目前只支持日分区'
+            f'{DO_RESET}'
+        )
+    elif to_system == "kafka":
+        print(
+            f'{NORM_MGT}Parameters when to is Kafka: \n'
+            f'{NORM_GRN}\t<[-]-T< /=>kafka topic>           Kafka Topic\n'
+            f'{NORM_GRN}\t<[-]-K< /=>kafka key>             Kafka Key of data\n'
+            f'{DO_RESET}'
+        )
+    elif to_system == "mongo":
+        print(
+            f'{NORM_MGT}Parameters when to is Mongo: \n'
+            f'{NORM_CYN}\t[[-]-inc-col]                     增量抽取字段名称(默认update_time)'
+            f'{DO_RESET}'
+        )
+    elif to_system == "mysql":
+        print(
+            f'{NORM_MGT}Parameters when to is MySQL: \n'
+            f'{NORM_GRN}\t<[-]-h< /=>host>                  MySQL主机\n'
+            f'{NORM_CYN}\t[[-]-P< /=>port]                  MySQL端口\n'
+            f'{NORM_GRN}\t<[-]-u< /=>username>              MySQL用户\n'
+            f'{NORM_GRN}\t<[-]-p< /=>password>              MySQL密码\n'
+            f'{NORM_GRN}\t<[-]-D< /=>database>              MySQL数据库\n'
+            f'{NORM_GRN}\t<[-]-t< /=>table>                 MySQL数据表'
+            f'{DO_RESET}'
+        )
+    exit(code)
+
+
+def hdfs_elasticsearch_generator():
+    raise Exception(f'Not implemented yet with from {from_system} and to {to_system}')
+
+
+def hdfs_hbase_generator():
+    with SparkSQL('datax-gc-generator') as spark_sql:
+        hdfs_ds_name = 'hdfs-aliyun-cloud'
+        hbase_ds_name = 'hbase-default'
+        hdfs_path = '/user/hive/warehouse'
+        output = CONFIG.get('output', f'{base_dir}/ignored')
+        hive_database = CONFIG.get('d')
+        if not hive_database:
+            pretty_print(f'{NORM_RED}参数 {NORM_GRN}-d{DO_RESET}{NORM_RED} 未提供')
+            usage(1)
+        config_ini_path = '{0}/config/hdfs-hbase/{1}'.format(output, hive_database)
+        os.system(f'mkdir -p {config_ini_path}')
+        included_tables = CONFIG.get('t', [])
+        if isinstance(included_tables, str):
+            included_tables = [included_tables]
+        excluded_tables = CONFIG.get('e', [])
+        if isinstance(excluded_tables, str):
+            excluded_tables = [excluded_tables]
+        if len(included_tables) == 0 and len(excluded_tables) == 0:
+            pretty_print(f'{NORM_YEL}注意:'
+                         f'{NORM_MGT}参数 {NORM_GRN}-t{NORM_MGT} 或 {NORM_GRN}-e '
+                         f'{NORM_MGT}未提供,将扫描数据库 '
+                         f'{NORM_GRN}{hive_database}{DO_RESET} '
+                         f'{NORM_MGT}下所有表')
+        hbase_namespace = CONFIG.get('n')
+        if not hbase_namespace:
+            pretty_print(f'{NORM_YEL}注意:'
+                         f'{NORM_MGT}参数 {NORM_GRN}-n{DO_RESET} '
+                         f'{NORM_MGT}未提供,将使用 HBase 默认命名空间 '
+                         f'{NORM_GRN}default')
+            hbase_namespace = 'default'
+        final_tables = []
+        if len(included_tables) == 0:
+            all_hive_tables = []
+            # 如果没有传入仅需的表则获取数据库下所有表
+            tables_df, _ = spark_sql.query(f'show tables in {hive_database}', silent=True)
+            desc_tables = tables_df.collect()
+            for row in desc_tables:
+                all_hive_tables.append(row[1])
+            # 去除忽略的数据表
+            for hive_table in all_hive_tables:
+                if hive_table not in excluded_tables:
+                    final_tables.append(f'{hive_database}.{hive_table}')
+        else:
+            final_tables = included_tables
+        number_width = len(str(len(final_tables)))
+        index = 1
+        for hive_table_full_name in final_tables:
+            if hive_table_full_name.__contains__("."):
+                hive_db, hive_table_name = hive_table_full_name.split('.')
+            else:
+                hive_db = hive_database
+                hive_table_name = hive_table_full_name
+            # 获取表备注
+            create_table_str = spark_sql.query_scalar(f'show create table {hive_table_full_name}', silent=True)
+            hive_table_comment = create_table_str.split('ROW FORMAT SERDE')[0].split(")")[1]
+            if 'COMMENT' in hive_table_comment:
+                hive_table_comment = hive_table_comment.split("'")[1]
+            else:
+                hive_table_comment = ''
+            # 表是否有分区
+            partitioned = False
+            if "`dt`" in create_table_str:
+                partitioned = True
+            # 获取表列名
+            hive_columns = spark_sql.get_columns(f'{hive_table_full_name}')
+            hive_column_names = []
+            hive_column_types = {}
+            for column_name, column_value in hive_columns.items():
+                hive_column_names.append(column_name)
+                column_type = column_value[0].upper()
+                if column_type == 'BIGINT':
+                    column_type = 'LONG'
+                hive_column_types[column_name] = column_type
+            hdfs_reader_str = HDFSReader.generate_definition(hdfs_ds_name, hdfs_path, hive_db, hive_table_name,
+                                                             hive_table_comment,
+                                                             partitioned,
+                                                             hive_column_names, hive_column_types)
+            # hbase列名
+            hbase_column_names = []
+            for column_name in hive_column_names:
+                hbase_column_names.append(f'cf:{column_name}')
+            row_key_columns = []
+            row_key_columns.append('reverse(主键如果是自增ID,建议reverse)')
+            row_key_columns.append('separator(@@)')
+            row_key_columns.append(f'separator({hive_db})')
+            row_key_columns.append(f'separator(.)')
+            row_key_columns.append(f'separator({hive_table_name})')
+            hbase_writer_str = HBaseWriter.generate_definition(hbase_ds_name,
+                                                               hbase_namespace,
+                                                               hive_table_name,
+                                                               hive_table_name,
+                                                               hive_table_comment,
+                                                               "cf",
+                                                               hbase_column_names,
+                                                               {},
+                                                               row_key_columns)
+            # 写文件
+            generator_config_file = '{0}/hdfs-hbase-{1}-{2}.ini'.format(config_ini_path, hbase_namespace, hive_table_name)
+            write_file(f'{hdfs_reader_str}\n\n{hbase_writer_str}\n', generator_config_file)
+            pretty_print(
+                f'{NORM_YEL}{str(index).rjust(number_width, " ")}. {NORM_MGT} 读 Hive 数据库表 '
+                f'{NORM_GRN}{hive_database}.{hive_table_name}  '
+                f'{NORM_MGT}\n                    写 HBase DataX作业配置文件生成器配置已写入文件 '
+                f'{NORM_GRN}{generator_config_file}'
+            )
+            index += 1
+
+
+def hdfs_kafka_generator():
+    kafka_topic = CONFIG.get('T')
+    kafka_key = CONFIG.get('K')
+    if not kafka_topic:
+        pretty_print(f'{NORM_RED}参数 {NORM_GRN}-T{DO_RESET}{NORM_RED} 未提供')
+        usage(1)
+    included_tables = CONFIG.get('t', [])
+    if isinstance(included_tables, str):
+        included_tables = [included_tables]
+    excluded_tables = CONFIG.get('e', [])
+    if isinstance(excluded_tables, str):
+        excluded_tables = [excluded_tables]
+    hive_database = CONFIG.get('d')
+    if len(included_tables) == 0 and len(excluded_tables) == 0:
+        if not hive_database:
+            pretty_print(f'{NORM_RED}参数 {NORM_GRN}-d{DO_RESET}{NORM_RED} 未提供')
+            usage(1)
+        else:
+            pretty_print(f'{NORM_YEL}注意:'
+                         f'{NORM_MGT}参数 {NORM_GRN}-t{NORM_MGT} 或 {NORM_GRN}-e '
+                         f'{NORM_MGT}未提供,将扫描数据库 '
+                         f'{NORM_GRN}{hive_database}{DO_RESET} '
+                         f'{NORM_MGT}下所有表')
+    with SparkSQL('datax-gc-generator') as spark_sql:
+        hdfs_ds_name = 'hdfs-aliyun-cloud'
+        hdfs_path = '/user/hive/warehouse'
+        kafka_ds_name = 'kafka-aliyun'
+        output = CONFIG.get('output', f'{base_dir}/ignored')
+        final_tables = []
+        if len(included_tables) == 0:
+            all_hive_tables = []
+            # 如果没有传入仅需的表则获取数据库下所有表
+            tables_df, _ = spark_sql.query(f'show tables in {hive_database}', silent=True)
+            desc_tables = tables_df.collect()
+            for row in desc_tables:
+                all_hive_tables.append(row[1])
+            # 去除忽略的数据表
+            for hive_table in all_hive_tables:
+                if hive_table not in excluded_tables:
+                    final_tables.append(f'{hive_database}.{hive_table}')
+        else:
+            final_tables = included_tables
+        number_width = len(str(len(final_tables)))
+        index = 1
+        # 判断是否为外部表,若为外部表,则获取表信息内容改变
+        for table_name in final_tables:
+            # if hive_table_full_name.endswith("es_mapping"):
+            #     continue
+            # hive_database, hive_table_name = hive_table_full_name.split('.')  # type: str,str
+            config_ini_path = '{0}/config/hdfs-kafka/{1}'.format(output, hive_database)
+            os.system(f'mkdir -p {config_ini_path}')
+            hive_table_name = ''
+            # 如果传入的配置中(-t=*)是完整的库名+表名(crl_es.xxx),则通过完整的库名表名获取信息
+            if table_name.__contains__('.'):
+                hive_database, hive_table_name = table_name.split('.')  # type: str,str
+                hive_table_full_name = table_name
+            # 如果传入的配置中是表名,则通过传入的配置获取库信息
+            elif hive_database is not None:
+                hive_table_name = table_name  # type: str
+                hive_table_full_name = f'{hive_database}.{hive_table_name}'
+            else:
+                raise ValueError("hive_database undefined")
+            hive_column_names = []
+            hive_column_types = {}
+            hive_table_names_mapping = {}
+            # 获取表备注
+            if hive_table_full_name.__contains__('mapping'):
+                mysql_handler = MySQLHandler(
+                    host='rm-m5er2i6wz605su9bi.mysql.rds.aliyuncs.com',
+                    port=3306,
+                    username='meta_ro',
+                    password='Ts#r5rO1'
+                )
+                partitioned = True  # 如果是mapping表则默认有分区,因为只是通过mapping表看映射
+                hive_table_tbl = dict(
+                    mysql_handler.query_tbl_hive_metadata(hive_table_name))  # 获取hive的mapping表全部的tblproperties
+                hive_table_comment = hive_table_tbl.get('comment')  # 获取hive表comment字段
+                hive_table_names_mapping = hive_table_tbl.get('es.mapping.names')  # 获取hive与es映射的字段信息
+                hive_table_es_index = hive_table_tbl.get('es.resource').split('/')[0]  # 获取写入es的索引
+                hive_table_column = mysql_handler.query_column_hive_metadata(hive_table_name)  # 获取hive表的列名、字段类型、注释等数据
+                hive_columns = {column[1]: (column[2], column[3]) for column in
+                                hive_table_column}  # 将读取到的列名、类型、注释转换成与spark读取出来的格式一致
+                kafka_column_types = {column[1]: column[2] for column in hive_table_column}  # 单独列出一个{列名:类型}的字典
+                hdfs_reader_column = [mappingName.split(':')[0] for mappingName in
+                                      str(hive_table_names_mapping).split(',')]  # 在hive与es映射字段的表内取出要读hdfs的字段
+                kafka_writer_column = [mappingName.split(':')[1] for mappingName in
+                                       str(hive_table_names_mapping).split(',')]  # 在hive与es映射字段的表内取出要写kafka的字段
+                column_mapping = dict(item.split(":") for item in hive_table_names_mapping.split(","))  # 将hive与es字段的映射转换为字典
+
+                for k, v in dict(
+                        item.split(":") for item in hive_table_names_mapping.split(",")).items():  # 保存一个不包含struct结构体映射的字典
+                    if k not in hive_columns.keys():
+                        column_mapping.pop(k)
+
+                for column in hdfs_reader_column.copy():  # HDFS reader中去掉结构体的列名
+                    if column not in hive_columns.keys():
+                        hdfs_reader_column.remove(column)
+
+                for column in kafka_writer_column.copy():  # kafka reader中去掉结构体的列名
+                    if column not in column_mapping.values():
+                        kafka_writer_column.remove(column)
+
+                hive_es_column_mapping = {hdfs_reader_column[i]: kafka_writer_column[i] for i in
+                                          range(len(hdfs_reader_column))}
+
+                for key, value in kafka_column_types.copy().items():  # 替换kafka writer字典中的键
+                    if key in hive_es_column_mapping:
+                        kafka_column_types[hive_es_column_mapping[key]] = kafka_column_types.pop(key)
+                    if key not in hive_es_column_mapping:
+                        kafka_column_types.pop(key)
+
+                hive_column_types = {}
+                column_type_flag = ['LONG', 'BIGINT', 'BOOLEAN', 'STRING', 'DOUBLE']
+                for column_name, column_value in hive_columns.items():
+                    hive_column_names.append(column_name)
+                    column_type = column_value[0].upper()
+                    if column_type in column_type_flag:
+                        hive_column_types[column_name] = column_type
+                    else:
+                        if column_type == 'INT':
+                            hive_column_types[column_name] = 'LONG'
+
+                hive_column_names = hdfs_reader_column
+                hive_table_name = re.sub(r"({}).*$".format(re.escape("sum")), "sum", hive_table_name)
+                if kafka_key is None:
+                    kafka_key = hive_table_es_index
+            else:
+                create_table_str = spark_sql.query_scalar(f'show create table {hive_table_full_name}', silent=True)
+                hive_table_comment = create_table_str.split('ROW FORMAT SERDE')[0].split(")")[1]
+                if 'COMMENT' in hive_table_comment:
+                    hive_table_comment = hive_table_comment.split("'")[1]
+                else:
+                    hive_table_comment = ''
+                # 表是否有分区
+                partitioned = False
+                if "`dt`" in create_table_str:
+                    partitioned = True
+                # 获取表列名
+                hive_columns = spark_sql.get_columns(f'{hive_table_full_name}')
+
+                column_type_flag = ['LONG', 'BIGINT', 'BOOLEAN', 'STRING', 'DOUBLE']
+                for column_name, column_value in hive_columns.items():
+                    hive_column_names.append(column_name)
+                    column_type = column_value[0].upper()
+                    if column_type in column_type_flag:
+                        hive_column_types[column_name] = column_type
+                    else:
+                        if column_type == 'INT':
+                            hive_column_types[column_name] = 'LONG'
+
+                kafka_writer_column = hive_column_names
+                kafka_column_types = hive_column_types
+
+            hdfs_reader_str = HDFSReader.generate_definition(hdfs_ds_name, hdfs_path, hive_database, hive_table_name,
+                                                             hive_table_comment,
+                                                             partitioned,
+                                                             hive_column_names, hive_column_types)
+
+            source_name = hive_table_name.replace('es_crl_', '').replace('_sum', '')
+            kafka_writer_str = KafkaWriter.generate_definition(kafka_ds_name, kafka_topic, kafka_key, source_name,
+                                                               kafka_writer_column, kafka_column_types,
+                                                               hive_table_names_mapping)
+
+            # 写文件
+            generator_config_file = '{0}/hdfs-kafka-{1}-{2}.ini'.format(config_ini_path, hive_database, hive_table_name)
+            write_file(f'{hdfs_reader_str}\n\n{kafka_writer_str}\n', generator_config_file)
+            pretty_print(
+                f'{NORM_YEL}{str(index).rjust(number_width, " ")}. {NORM_MGT} 读 Hive 数据库表 '
+                f'{NORM_GRN}{hive_database}.{hive_table_name}'
+                f'{NORM_MGT}\n                    写 Kafka DataX作业配置文件生成器配置已写入文件 '
+                f'{NORM_GRN}{generator_config_file}'
+            )
+            index += 1
+
+
+def hdfs_mongo_generator():
+    hive_database = CONFIG.get('d')
+    if not hive_database:
+        pretty_print(f'{NORM_RED}参数 {NORM_GRN}-d{DO_RESET}{NORM_RED} 未提供')
+        usage(1)
+    included_tables = CONFIG.get('t', [])
+    if isinstance(included_tables, str):
+        included_tables = [included_tables]
+    excluded_tables = CONFIG.get('e', [])
+    if isinstance(excluded_tables, str):
+        excluded_tables = [excluded_tables]
+    if len(included_tables) == 0 and len(excluded_tables) == 0:
+        pretty_print(f'{NORM_YEL}注意:'
+                     f'{NORM_MGT}参数 {NORM_GRN}-t{NORM_MGT} 或 {NORM_GRN}-e '
+                     f'{NORM_MGT}未提供,将扫描数据库 '
+                     f'{NORM_GRN}{hive_database}{DO_RESET} '
+                     f'{NORM_MGT}下所有表')
+    with SparkSQL('datax-gc-generator') as spark_sql:
+        # 获取json文件中表的主键
+        table_pk_fields = []
+        for validation_config_file in list_files('conf/validation', True):
+            if validation_config_file.endswith('.json'):
+                validation_config = load_json_file(validation_config_file)
+                dwd_table_config = validation_config.get('dwd_table')  # type: str
+                if not dwd_table_config:
+                    pretty_print(f'{NORM_YEL}文件 {NORM_GRN}{validation_config_file}'
+                                 f'{NORM_YEL} 中没有发现 {NORM_GRN}dwd_table{NORM_YEL} 的定义')
+                    continue
+                # 表名后缀,如:court_announcement
+                table_name = dwd_table_config.split("_crl_")[1]
+                ods_dwd_config = validation_config.get('ods_dwd_config')
+                if not ods_dwd_config:
+                    continue
+                # 获取主键名
+                new_pk_fields = ods_dwd_config.get('pk_fields')  # type:list
+                # 放入到字典中
+                for pk_fields in new_pk_fields:
+                    table_pk_fields.append(table_name + "-" + pk_fields)
+        hdfs_ds_name = 'hdfs-aliyun-cloud'
+        hdfs_path = '/user/hive/warehouse'
+        output = CONFIG.get('output', f'{base_dir}/ignored')
+        config_ini_path = '{0}/config/hdfs-mongo/{1}'.format(output, hive_database)
+        os.system(f'mkdir -p {config_ini_path}')
+        final_tables = []
+        if len(included_tables) == 0:
+            all_hive_tables = []
+            # 如果没有传入仅需的表则获取数据库下所有表
+            tables_df, _ = spark_sql.query(f'show tables in {hive_database}', silent=True)
+            desc_tables = tables_df.collect()
+            for row in desc_tables:
+                all_hive_tables.append(row[1])
+            # 去除忽略的数据表
+            for hive_table in all_hive_tables:
+                if hive_table not in excluded_tables:
+                    final_tables.append(f'{hive_database}.{hive_table}')
+        else:
+            final_tables = included_tables
+        number_width = len(str(len(final_tables)))
+        index = 1
+        for hive_table_full_name in final_tables:
+            hive_database, hive_table_name = hive_table_full_name.split('.')
+            # 获取表备注
+            create_table_str = spark_sql.query_scalar(f'show create table {hive_table_full_name}', silent=True)
+            hive_table_comment = create_table_str.split('ROW FORMAT SERDE')[0].split(")")[1]
+            if 'COMMENT' in hive_table_comment:
+                hive_table_comment = hive_table_comment.split("'")[1]
+            else:
+                hive_table_comment = ''
+            # 表是否有分区
+            partitioned = False
+            if "`dt`" in create_table_str:
+                partitioned = True
+            # 获取表列名
+            hive_columns = spark_sql.get_columns(f'{hive_table_full_name}')
+            hive_column_names = []
+            hive_column_types = {}
+            for column_name, column_value in hive_columns.items():
+                hive_column_names.append(column_name)
+                column_type = column_value[0].upper()
+                if column_type == 'BIGINT':
+                    column_type = 'LONG'
+                hive_column_types[column_name] = column_type
+            hdfs_reader_str = HDFSReader.generate_definition(hdfs_ds_name, hdfs_path, hive_database, hive_table_name,
+                                                             hive_table_comment,
+                                                             partitioned,
+                                                             hive_column_names, hive_column_types)
+            mongo_ds_name = 'mongo-dev-rw'
+            if hive_table_full_name.__contains__('_crl_'):
+                split_table_name = hive_table_full_name.split("_crl_")[1]
+            else:
+                split_table_name = 'unknown'
+            mongo_database = 'enterprise'
+            mongo_collection = snake_case_to_pascal_case(split_table_name)
+            # mongo列名
+            mongo_column_names = []
+            for column_name in hive_column_names:
+                if MONGO_SPECIAL_WORDS_DICT.__contains__(column_name):
+                    mongo_column_names.append(MONGO_SPECIAL_WORDS_DICT[column_name])
+                else:
+                    column_name = snake_case_to_camel_case(column_name)
+                    mongo_column_names.append(column_name)
+            # mongo列类型和主键
+            mongo_column_types = {}
+            pk_fields = []
+            for column_name, column_type in hive_column_types.items():
+                is_pk = False
+                if split_table_name + "-" + column_name in table_pk_fields:
+                    is_pk = True
+                if MONGO_SPECIAL_WORDS_DICT.__contains__(column_name):
+                    column_name = MONGO_SPECIAL_WORDS_DICT[column_name]
+                else:
+                    column_name = snake_case_to_camel_case(column_name)
+                if column_name == 'createDate' and column_type == 'LONG':
+                    mongo_column_types[column_name] = 'DATE'
+                elif column_name == 'updateDate' and column_type == 'LONG':
+                    mongo_column_types[column_name] = 'DATE'
+                else:
+                    mongo_column_types[column_name] = column_type
+                if is_pk:
+                    pk_fields.append(column_name)
+
+            mongo_writer_str = MongoWriter.generate_definition(mongo_ds_name, mongo_database, mongo_collection,
+                                                               mongo_column_names,
+                                                               mongo_column_types, pk_fields)
+            # 写文件
+            generator_config_file = '{0}/hdfs-mongo-{1}-{2}.ini'.format(config_ini_path, mongo_database, split_table_name)
+            write_file(f'{hdfs_reader_str}\n\n{mongo_writer_str}\n', generator_config_file)
+            pretty_print(
+                f'{NORM_YEL}{str(index).rjust(number_width, " ")}. {NORM_MGT} 读 Hive 数据库表 '
+                f'{NORM_GRN}{hive_database}.{hive_table_name}'
+                f'{NORM_MGT}\n                    写 MongoDB DataX作业配置文件生成器配置已写入文件 '
+                f'{NORM_GRN}{generator_config_file}'
+            )
+            index += 1
+
+
+def hdfs_mysql_generator():
+    raise Exception(f'Not implemented yet with from {from_system} and to {to_system}')
+
+
+def mysql_hdfs_generator():
+    host = CONFIG.get('h')
+    port = int(CONFIG.get('P', '3306'))
+    username = CONFIG.get('u')
+    password = CONFIG.get('p')
+    mysql_database = CONFIG.get('D')
+    tables = CONFIG.get('t', [])
+    if isinstance(tables, str):
+        tables = [tables]
+    excluded_tables = CONFIG.get('e', [])
+    if isinstance(excluded_tables, str):
+        excluded_tables = [excluded_tables]
+    table_regex = CONFIG.get('tr', [])
+    if isinstance(table_regex, str):
+        table_regex = [table_regex]
+    exclude_regex = CONFIG.get('er', [])
+    if isinstance(exclude_regex, str):
+        exclude_regex = [exclude_regex]
+    project = CONFIG.get('project')
+    layer = CONFIG.get('layer', 'ods')
+    edition = CONFIG.get('edition')
+    env = CONFIG.get('env')
+    partitioned = CONFIG.get('partitioned', False)
+    inc_col = CONFIG.get('inc-col', 'update_time')
+    if not (host and username and password and mysql_database):
+        usage(1)
+    hdfs_ds_name = 'hdfs-aliyun-cloud'
+    hdfs_default_fs = 'hdfs://cluster'
+    hdfs_path = '/user/hive/warehouse'
+    output = CONFIG.get('output', f'{base_dir}/ignored')
+    hive_database = get_hive_database_name(project, layer, env)
+    hive_table_prefix = get_hive_table_prefix(project, layer, edition)
+    mysql_datasource_path = '{0}/datasource/mysql/{1}'.format(output, hive_database)
+    hdfs_datasource_path = '{0}/datasource/hdfs'.format(output)
+    config_ini_path = '{0}/config/mysql-hdfs/{1}'.format(output, hive_database)
+    hive_ddl_path = '{0}/ddl'.format(output)
+    hive_ddl_file = '{0}/{1}.sql'.format(hive_ddl_path, mysql_database)
+    os.system(f'mkdir -p {mysql_datasource_path}')
+    os.system(f'mkdir -p {hdfs_datasource_path}')
+    os.system(f'mkdir -p {config_ini_path}')
+    os.system(f'mkdir -p {hive_ddl_path}')
+    mysql_ds_def = MySQLDataSource.generate_definition(host, port, username, password, mysql_database)
+    hdfs_ds_def = HDFSDataSource.generate_definition(hdfs_default_fs)
+    mysql_handler = MySQLHandler(host, port, username, password)
+    mysql_tables = mysql_handler.list_tables(mysql_database, exclude_regex, table_regex)
+    number_width = len(str(len(mysql_tables)))
+    index = 1
+    if os.path.exists(hive_ddl_file):
+        hive_ddl_file = '{0}/{1}-{2}.sql'.format(hive_ddl_path, mysql_database, str(int(time.time())))
+    write_file('CREATE DATABASE IF NOT EXISTS %s;\n' % hive_database, hive_ddl_file)
+    for mysql_table_name, mysql_table_comment in mysql_tables.items():
+        if tables and not tables.__contains__(mysql_table_name):
+            continue
+        if excluded_tables and excluded_tables.__contains__(mysql_table_name):
+            continue
+        mysql_column_list = mysql_handler.list_columns(mysql_database, mysql_table_name)
+        mysql_column_names = [c.COLUMN_NAME for c in mysql_column_list]
+        column_types = convert_mysql_column_types(mysql_column_list)
+        mysql_reader_def = MySQLReader.generate_definition(
+            mysql_database,
+            mysql_table_name, mysql_table_comment,
+            mysql_column_names, column_types,
+            hive_database, partitioned, inc_col
+        )
+
+        hive_table_name = f'{hive_table_prefix}_{mysql_table_name}'
+
+        hive_ddl_def = MySQLReader.generate_hive_ddl(
+            hive_database, hive_table_name, mysql_table_comment,
+            partitioned,
+            mysql_column_list, column_types
+        )
+        hdfs_writer_def = HDFSWriter.generate_definition(
+            hdfs_ds_name, hdfs_path,
+            hive_database, hive_table_name, partitioned,
+            mysql_column_names, column_types
+        )
+        generator_config_file = '{0}/mysql-hdfs-{1}-{2}.ini'.format(config_ini_path, mysql_database, mysql_table_name)
+        write_file(f'{mysql_reader_def}\n{hdfs_writer_def}\n', generator_config_file)
+        append_file(f'{hive_ddl_def}\n', hive_ddl_file)
+        pretty_print(
+            f'{NORM_YEL}{str(index).rjust(number_width, " ")}{NORM_MGT} 读 MySQL 数据库表 '
+            f'{NORM_GRN}{mysql_database}.{mysql_table_name}'
+            f'{NORM_MGT}\n                    写 HDFS DataX作业配置文件生成器配置已写入文件 '
+            f'{NORM_GRN}{generator_config_file}'
+            f'{NORM_MGT}\n                    对应的Hive建表DDL已写入文件 '
+            f'{NORM_GRN}{hive_ddl_file}'
+        )
+        index += 1
+        # break
+    mysql_ds_def_file = '{0}/mysql-{1}.ini'.format(mysql_datasource_path, mysql_database)
+    if os.path.exists(mysql_ds_def_file):
+        mysql_ds_def_file = '{0}/mysql-{1}-{2}.ini'.format(mysql_datasource_path, mysql_database, str(int(time.time())))
+    write_file(f'{mysql_ds_def}\n', mysql_ds_def_file)
+    pretty_print(f'{NORM_MGT}MySQL数据库 {NORM_GRN}{mysql_database}{NORM_MGT} 数据源配置已写入文件 {NORM_GRN}{mysql_ds_def_file}')
+    hdfs_ds_def_file = '{0}/{1}.ini'.format(hdfs_datasource_path, hdfs_ds_name)
+    if os.path.exists(hdfs_ds_def_file):
+        hdfs_ds_def_file = '{0}/{1}-{2}.ini'.format(hdfs_datasource_path, hdfs_ds_name, str(int(time.time())))
+    write_file(f'{hdfs_ds_def}\n', hdfs_ds_def_file)
+    pretty_print(f'{NORM_MGT}HDFS 数据源配置已写入文件 {NORM_GRN}{hdfs_ds_def_file}')
+
+
+def mysql_hbase_generator():
+    host = CONFIG.get('h')
+    port = int(CONFIG.get('P', '3306'))
+    username = CONFIG.get('u')
+    password = CONFIG.get('p')
+    mysql_database = CONFIG.get('D')
+    if not (host and username and password and mysql_database):
+        usage(1)
+    included_tables = CONFIG.get('t', [])
+    if isinstance(included_tables, str):
+        included_tables = [included_tables]
+    excluded_tables = CONFIG.get('e', [])
+    if isinstance(excluded_tables, str):
+        excluded_tables = [excluded_tables]
+    table_regex = CONFIG.get('tr', [])
+    if isinstance(table_regex, str):
+        table_regex = [table_regex]
+    exclude_regex = CONFIG.get('er', [])
+    if isinstance(exclude_regex, str):
+        exclude_regex = [exclude_regex]
+    if len(included_tables) == 0 and len(excluded_tables) == 0 and len(table_regex) == 0 and len(exclude_regex) == 0:
+        pretty_print(f'{NORM_YEL}注意:'
+                     f'{NORM_MGT}参数 {NORM_GRN}-t -e -tr -er '
+                     f'{NORM_MGT}都未提供,将扫描数据库 '
+                     f'{NORM_GRN}{mysql_database}{DO_RESET} '
+                     f'{NORM_MGT}下所有表')
+    hbase_namespace = CONFIG.get('n')
+    if not hbase_namespace:
+        pretty_print(f'{NORM_YEL}注意:'
+                     f'{NORM_MGT}参数 {NORM_GRN}-n{DO_RESET} '
+                     f'{NORM_MGT}未提供,将使用 HBase 默认命名空间 '
+                     f'{NORM_GRN}default')
+        hbase_namespace = 'default'
+    project = CONFIG.get('project')
+    layer = CONFIG.get('layer', 'ods')
+    edition = CONFIG.get('edition')
+    env = CONFIG.get('env')
+    partitioned = CONFIG.get('partitioned', False)
+    inc_col = CONFIG.get('inc-col', 'update_time')
+    hbase_ds_name = 'hbase-default'
+    hdfs_default_fs = 'hdfs://cluster'
+    output = CONFIG.get('output', f'{base_dir}/ignored')
+    hive_database = get_hive_database_name(project, layer, env)
+    hive_table_prefix = get_hive_table_prefix(project, layer, edition)
+    mysql_datasource_path = '{0}/datasource/mysql/{1}'.format(output, hive_database)
+    config_ini_path = '{0}/config/mysql-hbase/{1}'.format(output, hive_database)
+    hive_ddl_path = '{0}/ddl'.format(output)
+    hive_ddl_file = '{0}/{1}.sql'.format(hive_ddl_path, mysql_database)
+    os.system(f'mkdir -p {mysql_datasource_path}')
+    os.system(f'mkdir -p {config_ini_path}')
+    os.system(f'mkdir -p {hive_ddl_path}')
+    mysql_ds_def = MySQLDataSource.generate_definition(host, port, username, password, mysql_database)
+    mysql_handler = MySQLHandler(host, port, username, password)
+    mysql_tables = mysql_handler.list_tables(mysql_database, exclude_regex, table_regex)
+    number_width = len(str(len(mysql_tables)))
+    index = 1
+    if os.path.exists(hive_ddl_file):
+        hive_ddl_file = '{0}/{1}-{2}.sql'.format(hive_ddl_path, mysql_database, str(int(time.time())))
+    write_file('CREATE DATABASE IF NOT EXISTS %s;\n' % hive_database, hive_ddl_file)
+    for mysql_table_name, mysql_table_comment in mysql_tables.items():
+        if included_tables and not included_tables.__contains__(mysql_table_name):
+            continue
+        if excluded_tables and excluded_tables.__contains__(mysql_table_name):
+            continue
+        mysql_column_list = mysql_handler.list_columns(mysql_database, mysql_table_name)
+        mysql_column_names = [c.COLUMN_NAME for c in mysql_column_list]
+        column_types = convert_mysql_column_types(mysql_column_list)
+        mysql_reader_def = MySQLReader.generate_definition(
+            mysql_database,
+            mysql_table_name, mysql_table_comment,
+            mysql_column_names, column_types,
+            hive_database, partitioned, inc_col
+        )
+
+        hive_table_name = f'{hive_table_prefix}_{mysql_table_name.lower()}_hbase_mapping'
+        hbase_table_name = mysql_table_name.lower()
+
+        hive_over_hbase_ddl_def = MySQLReader.generate_hive_over_hbase_ddl(
+            hive_database, hive_table_name, mysql_table_comment, hbase_namespace, hbase_table_name,
+            mysql_column_list, column_types
+        )
+        row_key_columns = []
+        row_key_columns.append('reverse(主键如果是自增ID,建议reverse)')
+        row_key_columns.append('separator(@@)')
+        row_key_columns.append(f'separator({mysql_database})')
+        row_key_columns.append(f'separator(.)')
+        row_key_columns.append(f'separator({mysql_table_name})')
+        hdfs_writer_def = HBaseWriter.generate_definition(
+            hbase_ds_name, hbase_namespace, hbase_table_name,
+            mysql_table_name, mysql_table_comment, 'cf',
+            mysql_column_names, column_types, row_key_columns
+        )
+        generator_config_file = '{0}/mysql-hbase-{1}-{2}.ini'.format(config_ini_path, mysql_database, mysql_table_name)
+        write_file(f'{mysql_reader_def}\n{hdfs_writer_def}\n', generator_config_file)
+        append_file(f'{hive_over_hbase_ddl_def}\n', hive_ddl_file)
+        pretty_print(
+            f'{NORM_YEL}{str(index).rjust(number_width, " ")}. {NORM_MGT} 读 MySQL 数据库表 '
+            f'{NORM_GRN}{mysql_database}.{mysql_table_name}'
+            f'{NORM_MGT}\n                    写 HBase DataX作业配置文件生成器配置已写入文件 '
+            f'{NORM_GRN}{generator_config_file}'
+            f'{NORM_MGT}\n                    对应的 Hive Over HBase 建表DDL已写入文件 '
+            f'{NORM_GRN}{hive_ddl_file}'
+        )
+        index += 1
+        # break
+    mysql_ds_def_file = '{0}/mysql-{1}.ini'.format(mysql_datasource_path, mysql_database)
+    if os.path.exists(mysql_ds_def_file):
+        mysql_ds_def_file = '{0}/mysql-{1}-{2}.ini'.format(mysql_datasource_path, mysql_database, str(int(time.time())))
+    write_file(f'{mysql_ds_def}\n', mysql_ds_def_file)
+    pretty_print(f'{NORM_MGT}MySQL数据库 {NORM_GRN}{mysql_database}{NORM_MGT} 数据源配置已写入文件 {NORM_GRN}{mysql_ds_def_file}')
+
+
+if __name__ == '__main__':
+    pretty_print(f'{NORM_MGT}{sys.argv[0]} 收到参数:{NORM_GRN}{" ".join(sys.argv[1:])}')
+    CONFIG, _ = parse_args(sys.argv[1:])
+    from_system = CONFIG.get('from')
+    if from_system:
+        from_system = from_system.lower()
+    to_system = CONFIG.get('to')
+    if to_system:
+        to_system = to_system.lower()
+    # usage() reads from_system/to_system, so resolve them before the help check
+    if exist(CONFIG, ['H', 'help']):
+        usage(0)
+    if not from_system or not to_system:
+        usage(1)
+    if from_system == "hdfs":
+        if to_system == "elasticsearch":
+            hdfs_elasticsearch_generator()
+        elif to_system == "hbase":
+            hdfs_hbase_generator()
+        elif to_system == "kafka":
+            hdfs_kafka_generator()
+        elif to_system == "mongo":
+            hdfs_mongo_generator()
+        elif to_system == "mysql":
+            hdfs_mysql_generator()
+        else:
+            raise Exception(f'Not implemented yet with from {from_system} and to {to_system}')
+    elif from_system == "mysql":
+        if to_system == "hbase":
+            mysql_hbase_generator()
+        elif to_system == "hdfs":
+            mysql_hdfs_generator()
+        else:
+            raise Exception(f'Not implemented yet with from {from_system} and to {to_system}')
+    else:
+        raise Exception(f'Not implemented yet with from {from_system} and to {to_system}')
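Several generators above repeat the same per-column loop when turning a Hive schema into a writer definition (for example `hdfs_hbase_generator` and `hdfs_mongo_generator`). Pulled out as a standalone helper it looks roughly like the sketch below; the `{name: (type, comment)}` shape is inferred from how `column_value[0]` is indexed in those loops, so treat it as an illustration rather than repository code.

```python
def normalize_hive_types(hive_columns):
    """Map Hive column types to the names used in the DataX writer definitions.

    hive_columns: {column_name: (type, comment)}, the shape SparkSQL.get_columns()
    appears to return in the generators above (assumed).
    """
    names, types = [], {}
    for name, (col_type, _comment) in hive_columns.items():
        names.append(name)
        col_type = col_type.upper()
        if col_type == "BIGINT":  # DataX writers expect LONG for 64-bit integers
            col_type = "LONG"
        types[name] = col_type
    return names, types

print(normalize_hive_types({"id": ("bigint", "pk"), "name": ("string", ""), "dt": ("string", "partition")}))
```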

+ 142 - 0
bin/datax-job-config-generator.py

@@ -0,0 +1,142 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+"""
+  读取定义在`conf/datax/config/${源类型}-${目标类型}/${项目}-${分层}-${Hive环境}[/数据库环境[/数据分组]]/${源类型}-${目标类型}-${源库名称}-${源表名称}.ini`中的配置,
+  以及在上述配置中定义、存储于`conf/datax/datasource/${ds-type}/${project}-${layer}-${env}`的`${ds-type}-${ds-name}.ini`中的
+  `reader`和`writer`,生成DataX作业配置文件。
+  若未提供`-output`来指定路径存储生成的DataX作业配置文件,则会默认将生成的DataX作业配置文件存储于`conf/datax/generated`中。
+"""
+import os
+import sys
+
+project_root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(project_root_dir)
+from dw_base import DO_RESET, NORM_CYN, NORM_GRN, NORM_MGT, NORM_RED, NORM_YEL
+from dw_base.common.config_constants import K_CONFIG_FILE, K_DIRECTORY, KL_HELP
+from dw_base.datax.job_config_generator import JobConfigGenerator
+from dw_base.utils.common_utils import exist
+from dw_base.utils.config_utils import parse_args
+from dw_base.utils.datetime_utils import get_yesterday, get_today
+from dw_base.utils.file_utils import list_files, get_abs_path
+from dw_base.utils.log_utils import pretty_print
+
+
+def usage(code: int):
+    print(
+        f'{NORM_MGT}Usage: {sys.argv[0]}\n'
+        f'{NORM_CYN}\t[-h/-H/--h/--H/--help]           打印脚本使用方法{DO_RESET}'
+    )
+    print(
+        f'{NORM_MGT}Usage: {sys.argv[0]}\n'
+        f'{NORM_GRN}\t<[-]-c< /=>job config>           DataX作业配置生成器配置文件路径,可以多次传入-c/--c或以逗号分隔的形式\n'
+        f'{NORM_GRN}\t                                 传入多个,传多个时参数start-date和stop-date共同使用\n'
+        f'{NORM_GRN}\t<[-]-d< /=>job config directory> 扫描指定路径下所有DataX作业配置生成器配置\n'
+        f'{NORM_GRN}\t                                 可以多次传入-d/--d或以逗号分隔的形式传入多个,优先级低于-c/--c\n'
+        f'{NORM_GRN}\t<[-]-r>                          递归扫描指定路径下所有DataX作业配置生成器配置,与-d/--d配合使用\n'
+        f'{NORM_CYN}\t[[-]-start-date< /=>start date]  yyyyMMdd[-/-yyyyMMdd]格式表达的日期(或日期范围)\n'
+        f'{NORM_CYN}\t[[-]-stop-date< /=>stop date]    yyyyMMdd[-/-yyyyMMdd]格式表达的日期(或日期范围)\n'
+        f'{NORM_CYN}\t[[-]-o< /=>output path]          DataX作业配置输出文件夹(绝对路径)'
+        f'{DO_RESET}'
+    )
+    exit(code)
+
+
+def collect_generate_config_files():
+    generator_config_files = set()
+    if CONFIG.__contains__(K_CONFIG_FILE):
+        # 传递了`DataX作业配置生成器配置.ini文件`
+        generator_config = CONFIG.get(K_CONFIG_FILE)
+        pretty_print(f'{NORM_MGT}使用“DataX作业配置文件生成器”配置文件')
+        if isinstance(generator_config, list):
+            # 以列表形式提供的
+            for c in generator_config:
+                generator_config_files.add(get_abs_path(c, check_exist=False))
+        else:
+            # 以逗号分隔形式提供的
+            for c in generator_config.split(','):
+                generator_config_files.add(get_abs_path(c, check_exist=False))
+    elif CONFIG.__contains__(K_DIRECTORY):
+        # 传递了含有`DataX作业配置生成器配置.ini文件`的目录
+        pretty_print(f'{NORM_MGT}使用“DataX作业配置文件生成器”配置文件目录')
+        recursive = CONFIG.get('r', False)
+        generator_config_dirs = []
+        if isinstance(CONFIG.get(K_DIRECTORY), str):
+            # 以逗号分隔的多个目录
+            dir_conf = CONFIG.get(K_DIRECTORY).split(',')  # type:list
+        else:
+            # 目录列表
+            dir_conf = CONFIG.get(K_DIRECTORY)  # type:list
+        for d in dir_conf:
+            d = get_abs_path(d, check_exist=False)
+            if not os.path.exists(d):
+                pretty_print(f'{NORM_YEL}“DataX作业配置文件生成器”配置文件目录 {NORM_GRN}{d}{NORM_YEL} 不存在')
+                raise FileNotFoundError(d)
+            if not os.path.isdir(d):
+                pretty_print(f'{NORM_YEL}“DataX作业配置文件生成器”配置文件目录 {NORM_GRN}{d}{NORM_YEL} 存在,但不是目录')
+                raise NotADirectoryError(d)
+            generator_config_dirs.append(d)
+        for each_dir in generator_config_dirs:
+            # 递归处理目录
+            files = list_files(each_dir, recursive)
+            for file in files:
+                generator_config_files.add(file)
+    return generator_config_files
+
+
+if __name__ == '__main__':
+    pretty_print(f'{NORM_MGT}{sys.argv[0]} 收到参数:{NORM_GRN}{" ".join(sys.argv[1:])}')
+    CONFIG, _ = parse_args(sys.argv[1:])
+    # 未提供任何参数或查看帮助
+    if len(sys.argv) == 1 or exist(CONFIG, KL_HELP):
+        usage(0)
+    # DataX作业配置生成器配置文件列表
+    generator_config_files = collect_generate_config_files()
+    if len(generator_config_files) == 0:
+        pretty_print(f'{NORM_RED}未找到任何有效的“DataX作业配置文件生成器”配置文件')
+        exit(1)
+    # 检查所有文件是否都存在
+    for gcf in generator_config_files:
+        short_name = gcf.replace(f'{project_root_dir}/', '')
+        try:
+            if not os.path.exists(gcf):
+                pretty_print(f'{NORM_YEL}“DataX作业配置文件生成器”配置 {NORM_GRN}{gcf}{NORM_YEL} 不存在')
+                raise FileNotFoundError(gcf)
+            elif not os.path.isfile(gcf):
+                pretty_print(f'{NORM_YEL}“DataX作业配置文件生成器”配置 {NORM_GRN}{gcf}{NORM_YEL} 存在,但不是文件')
+                raise IsADirectoryError(gcf)
+        except Exception as e:
+            pretty_print(f'使用配置文件 {short_name} 生成DataX作业配置文件(.json)失败')
+            raise e
+    # 开始生成`DataX作业配置文件`
+    start_date = CONFIG.get('start-date', get_yesterday())
+    stop_date = CONFIG.get('stop-date', get_today())
+    for gcf in generator_config_files:
+        # gcf应形如:${project_base_dir}/conf/datax/config/${src-type}-${dst-type}/${project}-${layer}-${env}/${src-type}-${dst-type}-${src-name}.ini
+        # ${project}-${layer}-${env}
+        temp = os.path.dirname(gcf).replace(f'{project_root_dir}/', '').replace(f'conf/datax/config/', '').split('/')
+        src_dst = temp[0]
+        if len(temp) > 1:
+            project_layer_env = temp[1]
+        else:
+            project_layer_env = 'default'
+        # project_layer_env = os.path.basename(path.dirname(gcf))
+        # src_dst = os.path.basename(path.dirname(path.dirname(gcf)))
+        # 默认输出(绝对)路径
+        default_output_dir = f'{project_root_dir}/conf/datax/generated'
+        job_config_name = os.path.basename(gcf).replace('.ini', '.json')
+
+        # 指定的输出路径,可以是绝对路径,也可以是相对(项目根目录)的路径
+        output_path_arr = ([CONFIG.get("o", default_output_dir), src_dst, project_layer_env] + temp[2:]
+                           + [job_config_name])
+        output = '/'.join(output_path_arr)
+
+        os.makedirs(os.path.dirname(output), exist_ok=True)
+        try:
+            pretty_print(f'{NORM_MGT}开始使用 {NORM_GRN}{gcf}{NORM_MGT} 生成DataX作业配置文件')
+            job_config_generator = JobConfigGenerator(project_root_dir, gcf, start_date, stop_date, output)
+            job_config_generator.run()
+            pretty_print(f'{NORM_MGT}DataX作业配置文件 {NORM_GRN}{output}{NORM_MGT} 生成成功')
+        except Exception as e:
+            pretty_print(f'{NORM_MGT}使用配置文件 {NORM_GRN}{gcf} {NORM_MGT}生成DataX作业配置文件(.json)失败')
+            raise e
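
A usage sketch based on the flags documented in usage() above; the .ini path reuses an example that appears elsewhere in this commit, and the output directory is a placeholder:

    # One generator config, explicit date range, custom output directory
    bin/datax-job-config-generator.py -c conf/datax/config/mysql-hdfs/bms_ods_test/mysql-hdfs-ik_bms_test-activity_labels.ini -start-date 20211101 -stop-date 20211102 -o /tmp/datax-generated
    # Recursively scan a directory of generator configs instead of naming files
    bin/datax-job-config-generator.py -d conf/datax/config/mysql-hdfs -r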

+ 13 - 0
bin/datax-multiple-hive-job-starter.py

@@ -0,0 +1,13 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+"""
+  Note:为方便本地调试设计,请勿在调度中使用
+"""
+import os
+import sys
+
+project_root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(project_root_dir)
+
+if __name__ == '__main__':
+    os.system(f'{project_root_dir}/bin/datax-multiple-hive-job-starter.sh {" ".join(sys.argv[1:])}')

+ 262 - 0
bin/datax-multiple-hive-job-starter.sh

@@ -0,0 +1,262 @@
+#!/bin/bash
+#--------------------------------------------------------------------------------------------------
+# 分布式并行启动多个DataX MySQL-Hive作业
+# 1. 可以同时通过4种方式来指定作业,但作业中是否有重复的配置,需要开发者来判断
+# 2. 可以传递--override来覆盖脚本内的所有配置,重新传递要执行的作业(方便单独跑失败的作业)
+# 3. 运行模式:本机串行、随机串行(意义不大)、本机并行(默认模式)、随机并行
+#--------------------------------------------------------------------------------------------------
+set -e
+BASE_DIR=$(
+  cd "$(dirname "$(realpath "$0")")/.." || exit
+  pwd
+)
+. "${BASE_DIR}"/bin/common/init.sh
+function usage() {
+  echo -e "${NORM_MGT}Usage: $0
+  ${NORM_CYN}\t[-h/-H/--h/--H/--help]                打印脚本使用方法${DO_RESET}"
+  echo -e "${NORM_MGT}Usage: $0
+  ${NORM_CYN}\t[--override]                          如果出现override,则只执行传入的配置,文件里定义的配置被忽略
+  ${NORM_CYN}\t[-t< /=>table]                        需要建分区的表
+  ${NORM_CYN}\t[-jc< /=>job config]                  DataX作业配置文件(json)
+  ${NORM_CYN}\t[-jcd< /=>job config directory]       DataX作业配置文件(json)目录
+  ${NORM_CYN}\t[-gc< /=>generator config]            DataX作业配置文件生成器的配置文件(ini)
+  ${NORM_CYN}\t[-gcd< /=>generator config directory] DataX作业配置文件生成器的配置文件(ini)目录
+  ${NORM_CYN}\t[-start-date< /=>start date]          开始日期(用以筛选数据)
+  ${NORM_CYN}\t[-stop-date< /=>stop date]            结束日期(用以筛选数据)
+  ${NORM_CYN}\t[-skip-add-partition]                 跳过添加分区
+  ${NORM_CYN}\t[-skip-datax]                         跳过DataX导出作业
+  ${NORM_CYN}\t[-random]                             随机选择Worker(默认本机执行,易造成压力大)
+  ${NORM_CYN}\t[-parallel]                           并行执行(默认串行)
+  ${DO_RESET}"
+  exit "$1"
+}
+
+function parse_args() {
+  for index in $(seq 1 $#); do
+    arg=${*:index:1}
+    case $arg in
+    --override)
+      partitioned_tables=()
+      job_config_array=()
+      job_config_directory_array=()
+      generator_config_array=()
+      generator_config_directory_array=()
+      ;;
+    *) ;;
+    esac
+  done
+
+  for index in $(seq 1 $#); do
+    arg=${*:index:1}
+    case $arg in
+    -t)
+      index=$((index + 1))
+      TABLE="${*:index:1}"
+      partitioned_tables+=("${TABLE}")
+      ;;
+    -t=*)
+      TABLE="${arg#*=}"
+      partitioned_tables+=("${TABLE}")
+      ;;
+    -jc)
+      index=$((index + 1))
+      JOB_CONFIG="${*:index:1}"
+      job_config_array+=("${JOB_CONFIG}")
+      ;;
+    -jc=*)
+      JOB_CONFIG="${arg#*=}"
+      job_config_array+=("${JOB_CONFIG}")
+      ;;
+    -jcd)
+      index=$((index + 1))
+      JCD="${*:index:1}"
+      job_config_directory_array+=("${JCD}")
+      ;;
+    -jcd=*)
+      JCD="${arg#*=}"
+      job_config_directory_array+=("${JCD}")
+      ;;
+    -gc)
+      index=$((index + 1))
+      GC="${*:index:1}"
+      generator_config_array+=("${GC}")
+      ;;
+    -gc=*)
+      GC="${arg#*=}"
+      generator_config_array+=("${GC}")
+      ;;
+    -gcd)
+      index=$((index + 1))
+      GCD="${*:index:1}"
+      generator_config_directory_array+=("${GCD}")
+      ;;
+    -gcd=*)
+      GCD="${arg#*=}"
+      generator_config_directory_array+=("${GCD}")
+      ;;
+    -start-date)
+      index=$((index + 1))
+      START_DATE="${*:index:1}"
+      ;;
+    -start-date=*)
+      START_DATE="${arg#*=}"
+      ;;
+    -stop-date)
+      index=$((index + 1))
+      STOP_DATE="${*:index:1}"
+      ;;
+    -stop-date=*)
+      STOP_DATE="${arg#*=}"
+      ;;
+    -skip-add-partition)
+      SKIP_ADD_PARTITION="true"
+      ;;
+    -skip-datax)
+      DEFAULT_ARGS+=("-skip-datax")
+      ;;
+    -random)
+      DEFAULT_ARGS+=("-random")
+      ;;
+    -parallel)
+      DEFAULT_ARGS+=("-parallel")
+      ;;
+    -h | -H | --h | --H | --help)
+      usage 0
+      ;;
+    *) ;;
+    esac
+  done
+  pretty_print "${NORM_MGT}${0} 收到参数:${NORM_GRN}${*}"
+}
+
+function parse_ddl() {
+  generator_config="${1}"
+  if [ ! -f "${generator_config}" ]; then
+    generator_config_path="${BASE_DIR}/${generator_config}"
+  else
+    generator_config_path="${generator_config}"
+  fi
+  if [ ! -f "${generator_config_path}" ]; then
+    # 没有找到配置文件
+    DDL=""
+    return
+  fi
+  path=$(grep "path =" "${generator_config_path}")
+  if [ "$(echo "${path}" | grep -c "/dt=\${dt}")" -eq 0 ]; then
+    # 非分区表
+    DDL=""
+    return
+  fi
+  if [[ "${path}" =~ .*\.db.* ]]; then
+    hive_db_name=$(echo "${path}" | awk -F'/' '{ for(i=1; i<=NF; i++) if($i ~ /\./) { print $i; exit } }' | cut -d '.' -f1)
+    hive_table_name=$(echo "${path}" | awk -F'/' '{ for(i=1; i<=NF; i++) if($i ~ /\./) { print $(i+1); exit } }')
+  else
+    hive_db_name="tmp"
+    hive_table_name=$(echo "${path}" | cut -d '/' -f5)
+  fi
+  DDL="ALTER TABLE ${hive_db_name}.${hive_table_name} ADD IF NOT EXISTS PARTITION(dt=${START_DATE});"
+}
+
+partitioned_tables=(
+  # 示例:`project`_`layer`.`layer`_`project`_`mysql-table-name`
+)
+# DataX mysql-hive配置文件(json)
+job_config_array=(
+  # 示例:conf/datax/generated/mysql-hive-`mysql-db-name`-`mysql-table-name`.json
+)
+job_config_directory_array=(
+  # 示例:conf/datax/generated
+)
+# DataX作业配置生成器的配置文件
+generator_config_array=(
+  # 示例:conf/datax/config/mysql-hdfs/`project`_`layer`/mysql-hive-`mysql-db-name`-`mysql-table-name`.ini
+  #  conf/datax/config/mysql-hdfs/bms_ods_test/mysql-hdfs-ik_bms_test-activity_labels.ini
+  #  conf/datax/config/mysql-hdfs/bms_ods_test/mysql-hdfs-ik_bms_test-ar_internal_metadata.ini
+)
+generator_config_directory_array=(
+  # 示例:conf/datax/config/mysql-hdfs/`project`_`layer`/
+  #  conf/datax/config/mysql-hdfs/bms_ods
+  #  conf/datax/config/mysql-hdfs/bms_ods_test
+  #  conf/datax/config/mysql-hdfs/crm_ods_dl
+  #  conf/datax/config/mysql-hdfs/jqr_ods
+  #  conf/datax/config/mysql-hdfs/skb_ods
+)
+DEFAULT_ARGS=()
+parse_args "${@}"
+if [ "$(uname)" = "Linux" ]; then
+  YESTERDAY=$(date -d '-1 day' +%Y%m%d)
+  TODAY=$(date +%Y%m%d)
+else
+  YESTERDAY=$(date -v-1d +%Y%m%d)
+  TODAY=$(date +%Y%m%d)
+fi
+if [ -z "${START_DATE}" ]; then
+  START_DATE=${YESTERDAY}
+fi
+if [ -z "${STOP_DATE}" ]; then
+  STOP_DATE=${TODAY}
+fi
+DEFAULT_ARGS+=("-start-date=${START_DATE}")
+DEFAULT_ARGS+=("-stop-date=${STOP_DATE}")
+HIVE_DDL=()
+# 显式声明的表
+for table in "${partitioned_tables[@]}"; do
+  HIVE_DDL+=("ALTER TABLE ${table} ADD IF NOT EXISTS PARTITION(dt=${START_DATE});")
+done
+# 从DataX作业配置生成器配置文件名称中解析出Hive表名
+for generator_config in "${generator_config_array[@]}"; do
+  # 形如:conf/datax/config/mysql-hdfs/`project`_`layer`/mysql-hdfs-`mysql-db-name`-`mysql-table-name`.ini
+  parse_ddl "${generator_config}"
+  if [ -n "${DDL}" ]; then
+    HIVE_DDL+=("${DDL}")
+  fi
+done
+# 从DataX作业配置生成器配置文件中解析出的表
+for generator_config_directory in "${generator_config_directory_array[@]}"; do
+  # 形如:conf/datax/config/mysql-hdfs/`project`_`layer`/
+  if [ ! -f "${generator_config_directory}" ]; then
+    generator_config_directory="${BASE_DIR}/${generator_config_directory}"
+  fi
+  pretty_print "${NORM_MGT}处理生成器配置文件目录 ${NORM_GRN}${generator_config_directory}"
+  for generator_config in "${generator_config_directory}"/*; do
+    # 形如:conf/datax/config/mysql-hdfs/`project`_`layer`/mysql-hdfs-`mysql-db-name`-`mysql-table-name`.ini
+    parse_ddl "${generator_config}"
+    if [ -n "${DDL}" ]; then
+      HIVE_DDL+=("${DDL}")
+    fi
+  done
+done
+if [ -n "${SKIP_ADD_PARTITION}" ]; then
+  pretty_print "${NORM_YEL}跳过添加Hive分区(-skip-add-partition)"
+else
+  if [ ${#HIVE_DDL[@]} -eq 0 ]; then
+    pretty_print "${NORM_YEL}没有需要创建Hive新分区的表"
+  fi
+  for ddl in "${HIVE_DDL[@]}"; do
+    pretty_print "${NORM_MGT}创建Hive新分区:${NORM_GRN}${ddl}"
+  done
+  if [ "${#HIVE_DDL[@]}" -gt 0 ]; then
+    hive -e "${HIVE_DDL[*]}"
+  fi
+fi
+JOB_CONFIG=()
+for job_config in "${job_config_array[@]}"; do
+  JOB_CONFIG+=("-c=${job_config}")
+done
+GENERATOR_CONFIG=()
+for generator_config in "${generator_config_array[@]}"; do
+  GENERATOR_CONFIG+=("-gc=${generator_config}")
+done
+# 运行DataX作业配置文件列表中定义的作业
+if [ "${#JOB_CONFIG[@]}" -gt 0 ]; then
+  "${BASE_DIR}"/bin/datax-multiple-job-starter.sh "${JOB_CONFIG[@]}" "${DEFAULT_ARGS[@]}"
+fi
+for job_config_directory in "${job_config_directory_array[@]}"; do
+  "${BASE_DIR}"/bin/datax-multiple-job-starter.sh "-cd=${job_config_directory}" "${DEFAULT_ARGS[@]}"
+done
+if [ "${#GENERATOR_CONFIG[@]}" -gt 0 ]; then
+  "${BASE_DIR}"/bin/datax-multiple-job-starter.sh "${GENERATOR_CONFIG[@]}" "${DEFAULT_ARGS[@]}"
+fi
+for generator_config_directory in "${generator_config_directory_array[@]}"; do
+  "${BASE_DIR}"/bin/datax-multiple-job-starter.sh "-gcd=${generator_config_directory}" "${DEFAULT_ARGS[@]}"
+done
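
Invocation sketches derived from the usage() text and the example paths in the arrays above; dates and paths are illustrative:

    # Run every job defined under one generator-config directory, in parallel, for yesterday
    bin/datax-multiple-hive-job-starter.sh -gcd conf/datax/config/mysql-hdfs/bms_ods_test -parallel
    # Re-run a single failed job only, ignoring the arrays hard-coded in this script
    bin/datax-multiple-hive-job-starter.sh --override -gc conf/datax/config/mysql-hdfs/bms_ods_test/mysql-hdfs-ik_bms_test-activity_labels.ini -start-date 20211101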

+ 13 - 0
bin/datax-multiple-job-starter.py

@@ -0,0 +1,13 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+"""
+  Note:为方便本地调试设计,请勿在调度中使用
+"""
+import os
+import sys
+
+project_root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(project_root_dir)
+
+if __name__ == '__main__':
+    os.system(f'{project_root_dir}/bin/datax-multiple-job-starter.sh {" ".join(sys.argv[1:])}')

+ 264 - 0
bin/datax-multiple-job-starter.sh

@@ -0,0 +1,264 @@
+#!/bin/bash
+#--------------------------------------------------------------------------------------------------
+# 启动多个DataX作业
+# 1. 注意确定DataX Workers —— `DATAX_WORKERS`
+# 2. `multiple-job-starter`不会打印日志,请前往`/${LOG_ROOT_DIR}/data/log/datax/${START_DATE}`查看日志
+# 3. 配置调度使用本脚本时如果指定了`-random`选择随机worker,则调度节点必须选择`local-worker`
+#--------------------------------------------------------------------------------------------------
+#set -e
+BASE_DIR=$(
+  cd "$(dirname "$(realpath "$0")")/.." || exit
+  pwd
+)
+. "${BASE_DIR}"/bin/common/init.sh
+
+function usage() {
+  echo -e "${NORM_MGT}Usage: $0
+  ${NORM_CYN}\t[-h/-H/--h/--H/--help]                打印脚本使用方法${DO_RESET}"
+  echo -e "${NORM_MGT}Usage: $0
+  ${NORM_GRN}\t<-c< /=>job config>                   DataX作业配置文件(.json,绝对路径),支持多个,优先级1
+  ${NORM_GRN}\t<-cd< /=>job config directory>        DataX作业配置文件目录(.json文件夹,项目内相对路径或绝对路径),优先级2
+  ${NORM_GRN}\t<-gc< /=>generator config>            DataX作业配置生成器配置文件(.ini,相对路径),支持多个,优先级3
+  ${NORM_GRN}\t<-gcd< /=>generator config directory> DataX作业配置生成器配置文件目录(.ini文件夹,项目内相对路径或绝对路径),优先级4
+  ${NORM_CYN}\t[-start-date< /=>start date]          开始日期(用以筛选数据)
+  ${NORM_CYN}\t[-stop-date< /=>stop date]            结束日期(用以筛选数据)
+  ${NORM_CYN}\t[-host< /=>host]                      执行作业的主机,非${RELEASE_USER}用户或host和random都未指定则在当前机器执行,指定host优先于随机选择主机
+  ${NORM_CYN}\t[-random]                             随机选择Worker,非${RELEASE_USER}用户或host和random都未指定则在当前机器执行
+  ${NORM_CYN}\t[-parallel]                           并行执行(默认串行)
+  ${NORM_CYN}\t[-skip-datax]                         跳过DataX导出作业
+  ${DO_RESET}"
+  exit "$1"
+}
+
+function parse_args() {
+  for index in $(seq 1 $#); do
+    arg=${*:index:1}
+    case $arg in
+    -c)
+      index=$((index + 1))
+      JOB_CONFIG_PATH+=("${*:index:1}")
+      ;;
+    -c=*)
+      JOB_CONFIG_PATH+=("${arg#*=}")
+      ;;
+    -cd)
+      index=$((index + 1))
+      JOB_CONFIG_DIR="${*:index:1}"
+      ;;
+    -cd=*)
+      JOB_CONFIG_DIR="${arg#*=}"
+      ;;
+    -gc)
+      index=$((index + 1))
+      GENERATOR_CONFIG_PATH+=("${*:index:1}")
+      ;;
+    -gc=*)
+      GENERATOR_CONFIG_PATH+=("${arg#*=}")
+      ;;
+    -gcd)
+      index=$((index + 1))
+      GENERATOR_CONFIG_DIR="${*:index:1}"
+      ;;
+    -gcd=*)
+      GENERATOR_CONFIG_DIR="${arg#*=}"
+      ;;
+    -start-date)
+      index=$((index + 1))
+      START_DATE="${*:index:1}"
+      ;;
+    -start-date=*)
+      START_DATE="${arg#*=}"
+      ;;
+    -stop-date)
+      index=$((index + 1))
+      STOP_DATE="${*:index:1}"
+      ;;
+    -stop-date=*)
+      STOP_DATE="${arg#*=}"
+      ;;
+    -host)
+      index=$((index + 1))
+      PASS_ON_ARGS+=("-host=${*:index:1}")
+      ;;
+    -host=*)
+      PASS_ON_ARGS+=("-host=${arg#*=}")
+      ;;
+    -random)
+      PASS_ON_ARGS+=("$arg")
+      ;;
+    -skip-datax)
+      PASS_ON_ARGS+=("$arg")
+      ;;
+    -parallel)
+      PARALLEL="true"
+      ;;
+    -h | -H | --h | --H | --help)
+      usage 0
+      ;;
+    *) ;;
+    esac
+  done
+  pretty_print "${NORM_MGT}${0} 收到参数:${NORM_GRN}${*}"
+  if [ "${#JOB_CONFIG_PATH[@]}" -eq 0 ] && [ -z "${JOB_CONFIG_DIR}" ] && [ "${#GENERATOR_CONFIG_PATH[@]}" -eq 0 ] && [ -z "${GENERATOR_CONFIG_DIR}" ]; then
+    pretty_print "${NORM_RED}请至少提供 ${NORM_GRN}-c、-cd、-gc、-gcd ${NORM_RED}中的一个"
+    usage 1
+  fi
+}
+
+function prepare() {
+  if [ "$(uname)" = "Linux" ]; then
+    YESTERDAY=$(date -d '-1 day' +%Y%m%d)
+    TODAY=$(date +%Y%m%d)
+  else
+    YESTERDAY=$(date -v-1d +%Y%m%d)
+    TODAY=$(date +%Y%m%d)
+  fi
+  if [ -z "${START_DATE}" ]; then
+    START_DATE=${YESTERDAY}
+  fi
+  if [ -z "${STOP_DATE}" ]; then
+    STOP_DATE=${TODAY}
+  fi
+  PASS_ON_ARGS+=("-start-date=${START_DATE}")
+  PASS_ON_ARGS+=("-stop-date=${STOP_DATE}")
+  if [ "${#JOB_CONFIG_PATH[@]}" -gt 0 ]; then
+    # 传递了`作业配置文件(列表)`
+    for jc in "${JOB_CONFIG_PATH[@]}"; do
+      if [[ "${JC_GC_ARGS[*]}" =~ .*"${jc}".* ]]; then
+        pretty_print "${NORM_YEL}提供了重复的DataX作业配置文件:${NORM_GRN}${jc}"
+        continue
+      fi
+      JC_GC_ARGS+=("-c=${jc}")
+    done
+  elif [ -n "${JOB_CONFIG_DIR}" ]; then
+    # 传递了`作业配置文件目录`
+    if [ ! -d "${JOB_CONFIG_DIR}" ]; then
+      # 目录不存在(可能原因是未在根目录执行脚本,传递的目录一定要是`相对路径`才可以)
+      JCD="${BASE_DIR}/${JOB_CONFIG_DIR}"
+      if [ ! -d "${JCD}" ]; then
+        pretty_print "${NORM_RED}提供的DataX作业配置目录 ${NORM_GRN}${JOB_CONFIG_DIR} ${NORM_RED}不存在"
+        exit 1
+      fi
+      JOB_CONFIG_DIR="${JCD}"
+    fi
+    for jc in "${JOB_CONFIG_DIR}"/*; do
+      # 需要`jc`都是json文件
+      if [ -f "${jc}" ]; then
+        JC_GC_ARGS+=("-c=${jc}")
+      fi
+    done
+  elif [ "${#GENERATOR_CONFIG_PATH[@]}" -gt 0 ]; then
+    # 传递了`DataX作业配置生成器配置文件(列表)`
+    for gc in "${GENERATOR_CONFIG_PATH[@]}"; do
+      if [[ "${JC_GC_ARGS[*]}" =~ .*"${gc}".* ]]; then
+        pretty_print "${NORM_YEL}提供了重复的DataX作业配置生成器配置文件:${NORM_GRN}${gc}"
+        continue
+      fi
+      JC_GC_ARGS+=("-gc=${gc}")
+    done
+  else
+    # 传递了`DataX作业配置生成器配置文件目录`
+    if [ ! -d "${GENERATOR_CONFIG_DIR}" ]; then
+      # 目录不存在(可能原因是未在根目录执行脚本,传递的目录一定要是相对路径才可以)
+      GCD="${BASE_DIR}/${GENERATOR_CONFIG_DIR}"
+      if [ ! -d "${GCD}" ]; then
+        pretty_print "${NORM_RED}提供的DataX作业配置生成器配置文件目录 ${NORM_GRN}${GENERATOR_CONFIG_DIR} ${NORM_RED}不存在"
+        exit 1
+      fi
+      GENERATOR_CONFIG_DIR="${GCD}"
+    fi
+    for gc in "${GENERATOR_CONFIG_DIR}"/*; do
+      # 需要`gc`都是ini文件
+      if [ -f "${gc}" ]; then
+        JC_GC_ARGS+=("-gc=${gc}")
+      fi
+    done
+  fi
+}
+
+function run_multiple_datax_job() {
+  success_count=0
+  failure_count=0
+  export MULTIPLE="true"
+  for arg in "${JC_GC_ARGS[@]}"; do
+    TEMP=$(dirname "${arg#*=}")
+    TEMP=${TEMP#"${BASE_DIR}/"}
+    TEMP=${TEMP#"conf/datax/config/"}
+    TEMP=${TEMP#"conf/datax/generated/"}
+    SRC_DST=$(echo "${TEMP}" | cut -d '/' -f1)
+    PROJECT_LAYER_ENV=$(echo "${TEMP}" | cut -d '/' -f2)
+    DB_ENV=$(echo "${TEMP}" | cut -d '/' -f3)
+    GROUP=$(echo "${TEMP}" | cut -d '/' -f4)
+    case ${arg} in
+    -c=*)
+      # -c=conf/datax/generated/from-to-database-table.json
+      JOB_NAME=$(basename "${arg#*=}" .json)
+      pretty_print "${NORM_MGT}使用DataX作业配置文件 ${NORM_GRN}${arg#*=} ${NORM_MGT}运行"
+      ;;
+    -gc=*)
+      # -gc=conf/datax/config/from-to/project_layer/from-to-database-table.ini
+      JOB_NAME=$(basename "${arg#*=}" .ini)
+      pretty_print "${NORM_MGT}使用DataX作业配置生成器配置文件 ${NORM_GRN}${arg#*=} ${NORM_MGT}运行"
+      ;;
+    *)
+      continue
+      ;;
+    esac
+    if [ "${USER}" == "${RELEASE_USER}" ]; then
+      if [ -n "${GROUP}" ]; then
+        LOG_RELATIVE_PATH="datax/${SRC_DST}/${PROJECT_LAYER_ENV}/${DB_ENV}/${GROUP}/${START_DATE}"
+      elif [ -n "${DB_ENV}" ]; then
+        LOG_RELATIVE_PATH="datax/${SRC_DST}/${PROJECT_LAYER_ENV}/${DB_ENV}/${START_DATE}"
+      else
+        LOG_RELATIVE_PATH="datax/${SRC_DST}/${PROJECT_LAYER_ENV}/${START_DATE}"
+      fi
+    else
+      if [ -n "${GROUP}" ]; then
+        LOG_RELATIVE_PATH="users/${USER}/datax/${SRC_DST}/${PROJECT_LAYER_ENV}/${DB_ENV}/${GROUP}/${START_DATE}"
+      elif [ -n "${DB_ENV}" ]; then
+        LOG_RELATIVE_PATH="users/${USER}/datax/${SRC_DST}/${PROJECT_LAYER_ENV}/${DB_ENV}/${START_DATE}"
+      else
+        LOG_RELATIVE_PATH="users/${USER}/datax/${SRC_DST}/${PROJECT_LAYER_ENV}/${START_DATE}"
+      fi
+    fi
+    LOG_DIR="${LOG_ROOT_DIR}/${LOG_RELATIVE_PATH}"
+    mkdir -p "${LOG_DIR}"
+    LOG_FILE_NAME="${START_DATE}-${JOB_NAME}.log"
+    LOG_FILE="${LOG_DIR}/${LOG_FILE_NAME}"
+    if [[ -n "${IS_RUN_BY_NORMAL_USER}" ]] || [ -z "${PARALLEL}" ] || [ "${CURRENT_HOST}" != "${RELEASE_HOST}" ]; then
+      # 普通用户或未指定并行或非发布主机
+      pretty_print "${NORM_MGT}日志将写入文件 ${NORM_GRN}${LOG_FILE} ${NORM_MGT}"
+      "${BASE_DIR}"/bin/datax-single-job-starter.sh "${PASS_ON_ARGS[@]}" "${arg}" | tee "${LOG_FILE}"
+      if [ "${PIPESTATUS[0]}" -eq 0 ]; then
+        success_count=$((success_count + 1))
+      else
+        failure_count=$((failure_count + 1))
+      fi
+    else
+      # 发布用户且指定并行
+      pretty_print "${NORM_MGT}日志将写入文件 ${NORM_GRN}${LOG_FILE} ${NORM_MGT}"
+      "${BASE_DIR}"/bin/datax-single-job-starter.sh "${PASS_ON_ARGS[@]}" "${arg}" >"${LOG_FILE}" 2>&1 &
+      success_count=$((success_count + 1))
+    fi
+    sleep 0.5s
+  done
+  if [[ -n "${IS_RUN_BY_NORMAL_USER}" ]] || [ -z "${PARALLEL}" ]; then
+    # 普通用户或未指定并行
+    pretty_print "${NORM_MGT}所有DataX作业都已完成,成功 ${NORM_GRN}${success_count}${NORM_MGT} 个,失败 ${NORM_RED}${failure_count}${NORM_MGT} 个"
+    exit ${failure_count}
+  else
+    pretty_print "${NORM_MGT}所有DataX作业都已启动(共启动 ${NORM_GRN}${success_count}${NORM_MGT} 个)"
+  fi
+}
+
+# DataX作业配置文件列表
+JOB_CONFIG_PATH=()
+# DataX作业配置生成器配置文件列表
+GENERATOR_CONFIG_PATH=()
+# 不作任何处理,传递给`datax-single-job-starter.sh`的参数
+PASS_ON_ARGS=()
+# 需作处理,加上`-c/-gc`的`DataX作业配置文件`或`DataX作业配置生成器配置文件`
+JC_GC_ARGS=()
+parse_args "${@}"
+prepare
+run_multiple_datax_job
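
Usage sketches following the priority order documented above (-c > -cd > -gc > -gcd); paths and dates are illustrative:

    # All generated .json jobs under one directory, serial, default date range
    bin/datax-multiple-job-starter.sh -cd conf/datax/generated/mysql-hdfs/bms_ods_test
    # A generator .ini file, parallel, on a randomly selected worker
    bin/datax-multiple-job-starter.sh -gc conf/datax/config/mysql-hdfs/bms_ods_test/mysql-hdfs-ik_bms_test-activity_labels.ini -random -parallel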

+ 13 - 0
bin/datax-single-job-starter.py

@@ -0,0 +1,13 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+"""
+  Note:为方便本地调试设计,请勿在调度中使用
+"""
+import os
+import sys
+
+project_root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(project_root_dir)
+
+if __name__ == '__main__':
+    os.system(f'{project_root_dir}/bin/datax-single-job-starter.sh {" ".join(sys.argv[1:])}')

+ 277 - 0
bin/datax-single-job-starter.sh

@@ -0,0 +1,277 @@
+#!/bin/bash
+#--------------------------------------------------------------------------------------------------
+# 启动单个DataX作业
+# 1. 注意确定Python3的路径`PYTHON3_PATH'、DataX的安装目录`DATAX_HOME`以及DataX Workers的声明——`datax_workers`
+# 2. 可以用-c传入生成好的DataX作业Json配置文件(绝对路径),或-gc传入ini文件(相对路径)
+# 3. ini文件的名称格式为:源系统类型-目标系统类型-数据库名称-数据集名称.ini
+#--------------------------------------------------------------------------------------------------
+CURRENT_DIR=$(pwd)
+BASE_DIR=$(
+  cd "$(dirname "$(realpath "$0")")/.." || exit
+  pwd
+)
+. "${BASE_DIR}"/bin/common/init.sh
+
+function usage() {
+  echo -e "${NORM_MGT}Usage: $0
+  ${NORM_CYN}\t[-h/-H/--h/--H/--help]       打印脚本使用方法${DO_RESET}"
+  echo -e "${NORM_MGT}Usage: $0
+  ${NORM_GRN}\t<-c< /=>job config>          DataX作业配置文件(.json,绝对路径)
+  ${NORM_GRN}\t<-gc< /=>generator config>   DataX作业配置生成器配置文件(.ini,项目内相对路径或绝对路径),-c优先
+  ${NORM_CYN}\t[-start-date< /=>start date] 开始日期(用以筛选数据)
+  ${NORM_CYN}\t[-stop-date< /=>stop date]   结束日期(用以筛选数据)
+  ${NORM_CYN}\t[-host< /=>host]             执行作业的主机,非${RELEASE_USER}用户或host和random都未指定则在当前机器执行,指定host优先于随机选择主机
+  ${NORM_CYN}\t[-random]                    随机选择主机,非${RELEASE_USER}用户或host和random都未指定则在当前机器执行
+  ${NORM_CYN}\t[-skip-datax]                跳过DataX导出作业
+  ${DO_RESET}"
+  exit "$1"
+}
+
+function select_worker() {
+  pretty_print "${NORM_MGT}本次作业执行用户为 ${NORM_GRN}${USER}"
+  if [ -z "${IS_RUN_BY_RELEASE_USER}" ]; then
+    # 非 ${RELEASE_USER} 用户只能在本机执行
+    selected_worker=${CURRENT_HOST}
+    pretty_print "${NORM_MGT}非 ${NORM_GRN}${RELEASE_USER}${NORM_MGT} 用户限制以本机 ${NORM_GRN}${CURRENT_HOST} ${NORM_MGT}为Worker运行作业"
+  elif [ -z "${IS_RUN_IN_RELEASE_DIR}" ]; then
+    # 非发布目录下只能在本机执行
+    selected_worker=${CURRENT_HOST}
+    pretty_print "${NORM_MGT}非发布目录 (${NORM_GRN}${RELEASE_ROOT_DIR}${NORM_MGT}) 下限制以本机 ${NORM_GRN}${CURRENT_HOST} ${NORM_MGT}为Worker运行作业"
+  else
+    if [ -n "${HOST}" ]; then
+      pretty_print "${NORM_MGT}用户指定执行Worker为 ${NORM_GRN}${HOST}"
+      selected_worker=${HOST}
+    elif [ -n "${IS_RANDOM}" ]; then
+      # 生成一个>=0, <数组长度的随机数
+      worker_index=$((RANDOM % ${#DATAX_WORKERS_QUEUE[@]}))
+      selected_worker=${DATAX_WORKERS_QUEUE[${worker_index}]}
+      pretty_print "${NORM_MGT}用户指定随机选择Worker, 执行Worker为 ${NORM_GRN}${selected_worker}"
+    else
+      # 只能在本机执行的情况
+      selected_worker=${CURRENT_HOST}
+      pretty_print "${NORM_MGT}用户既未指定Worker,也未选择随机决定Worker, 执行Worker为本机 ${NORM_GRN}${CURRENT_HOST}"
+    fi
+  fi
+}
+
+function generate_job_config() {
+  if [ -z "${JOB_CONFIG_PATH}" ]; then
+    # 未提供`DataX作业配置文件`
+    # 由提供的`DataX作业配置生成器配置文件`生成`DataX作业配置文件`
+    if [ "${selected_worker}" == "${CURRENT_HOST}" ]; then
+      ${PYTHON3_PATH} -u "${BASE_DIR}"/bin/datax-job-config-generator.py \
+        -c "${GENERATOR_CONFIG_PATH}" \
+        -start-date "${START_DATE}" \
+        -stop-date "${STOP_DATE}"
+    else
+      ssh "${selected_worker}" "${PYTHON3_PATH}" -u "${BASE_DIR}"/bin/datax-job-config-generator.py \
+        -c "${GENERATOR_CONFIG_PATH}" \
+        -start-date "${START_DATE}" \
+        -stop-date "${STOP_DATE}"
+    fi
+    # shellcheck disable=SC2181
+    if [ "$?" -ne 0 ]; then
+      pretty_print "${NORM_MGT}使用配置文件 ${NORM_GRN}${GENERATOR_CONFIG_RELATIVE_PATH} ${NORM_MGT}生成DataX作业配置文件失败"
+      exit 1
+    fi
+    TEMP=$(dirname "${GENERATOR_CONFIG_PATH}")
+    TEMP=${TEMP#"${BASE_DIR}/"}
+    TEMP=${TEMP#"conf/datax/config/"}
+    SRC_DST=$(echo "${TEMP}" | cut -d '/' -f1)
+    PROJECT_LAYER_ENV=$(echo "${TEMP}" | cut -d '/' -f2)
+    DB_ENV=$(echo "${TEMP}" | cut -d '/' -f3)
+    GROUP=$(echo "${TEMP}" | cut -d '/' -f4)
+    NEW=$(echo "${TEMP}" | cut -d '/' -f5)
+
+    # 修改生成的作业名称,能够识别多级目录
+    JOB_NAME=$(basename "${GENERATOR_CONFIG_PATH}" .ini)
+    JOB_CONFIG_RELATIVE_PATH="conf/datax/generated/${TEMP}/${JOB_NAME}.json"
+#    if [ -n "${NEW}" ]; then
+#      JOB_CONFIG_RELATIVE_PATH="conf/datax/generated/${SRC_DST}/${PROJECT_LAYER_ENV}/${DB_ENV}/${GROUP}/${NEW}/${JOB_NAME}.json"
+#    elif [ -n "${GROUP}" ]; then
+#      JOB_CONFIG_RELATIVE_PATH="conf/datax/generated/${SRC_DST}/${PROJECT_LAYER_ENV}/${DB_ENV}/${GROUP}/${JOB_NAME}.json"
+#    elif [ -n "${DB_ENV}" ]; then
+#      JOB_CONFIG_RELATIVE_PATH="conf/datax/generated/${SRC_DST}/${PROJECT_LAYER_ENV}/${DB_ENV}/${JOB_NAME}.json"
+#    else
+#      JOB_CONFIG_RELATIVE_PATH="conf/datax/generated/${SRC_DST}/${PROJECT_LAYER_ENV}/${JOB_NAME}.json"
+#    fi
+    JOB_CONFIG_PATH="${BASE_DIR}/${JOB_CONFIG_RELATIVE_PATH}"
+  else
+    # 提供了`DataX作业配置文件`
+    if [ ! -f "${JOB_CONFIG_PATH}" ]; then
+      # 如果脚本不是在根目录下执行,相对路径的配置文件是找不到的,因此要变成绝对路径
+      JOB_CONFIG_RELATIVE_PATH=${JOB_CONFIG_PATH}
+      JOB_CONFIG_PATH="${BASE_DIR}/${JOB_CONFIG_RELATIVE_PATH}"
+    elif [[ "${JOB_CONFIG_PATH}" =~ "${BASE_DIR}".* ]]; then
+      # `DataX作业配置文件`是绝对路径
+      JOB_CONFIG_RELATIVE_PATH=${JOB_CONFIG_PATH#"${BASE_DIR}/"}
+    else
+      # 执行目录下可找到的相对路径,加上当前目录
+      JOB_CONFIG_PATH="${CURRENT_DIR}/${JOB_CONFIG_PATH}"
+      JOB_CONFIG_RELATIVE_PATH=${JOB_CONFIG_PATH#"${BASE_DIR}/"}
+    fi
+    #    PROJECT_LAYER_ENV=$(basename "$(dirname "${JOB_CONFIG_PATH}")")
+    #    SRC_DST=$(basename "$(dirname "$(dirname "${JOB_CONFIG_PATH}")")")
+    TEMP=$(dirname "${JOB_CONFIG_PATH}")
+    TEMP=${TEMP#"${BASE_DIR}/"}
+    TEMP=${TEMP#"conf/datax/generated/"}
+    SRC_DST=$(echo "${TEMP}" | cut -d '/' -f1)
+    PROJECT_LAYER_ENV=$(echo "${TEMP}" | cut -d '/' -f2)
+    DB_ENV=$(echo "${TEMP}" | cut -d '/' -f3)
+    GROUP=$(echo "${TEMP}" | cut -d '/' -f4)
+    JOB_NAME=$(basename "${JOB_CONFIG_PATH}" .json)
+  fi
+  datax_run_command="${PYTHON3_PATH} -u ${DATAX_HOME}/bin/datax.py ${JOB_CONFIG_PATH}"
+}
+
+function check_data_exists() {
+  DATA_EXISTS="true"
+  if [[ "${JOB_CONFIG_PATH}" =~ ${BASE_DIR}/.*/.*hdfs[-_].* ]]; then
+    path=$(jq -r '.job.content[0].reader.parameter.path' "${JOB_CONFIG_PATH}")
+    if ! hadoop fs -test -e "${path}"; then
+      pretty_print "${NORM_MGT}HDFS 路径 ${NORM_GRN}${path}${NORM_MGT} 不存在,DataX不需要执行"
+      exit 0
+    fi
+    if ! hadoop fs -test -e "${path}/*"; then
+      pretty_print "${NORM_MGT}HDFS 路径 ${NORM_GRN}${path}${NORM_MGT} 中没有数据,DataX不需要执行"
+      DATA_EXISTS=""
+    fi
+    disk_usage=$(hadoop fs -du -s "${path}" | cut -d ' ' -f1)
+    if [ "${disk_usage}" -eq 0 ]; then
+      pretty_print "${NORM_MGT}HDFS 路径 ${NORM_GRN}${path}${NORM_MGT} 中没有数据,DataX不需要执行"
+      DATA_EXISTS=""
+    fi
+  fi
+}
+
+function parse_args() {
+  for index in $(seq 1 $#); do
+    arg=${*:index:1}
+    case $arg in
+    -c)
+      index=$((index + 1))
+      JOB_CONFIG_PATH="${*:index:1}"
+      ;;
+    -c=*)
+      JOB_CONFIG_PATH=${arg#*=}
+      ;;
+    -gc)
+      index=$((index + 1))
+      GENERATOR_CONFIG_PATH="${*:index:1}"
+      ;;
+    -gc=*)
+      GENERATOR_CONFIG_PATH=${arg#*=}
+      ;;
+    -start-date)
+      index=$((index + 1))
+      START_DATE="${*:index:1}"
+      ;;
+    -start-date=*)
+      START_DATE="${arg#*=}"
+      ;;
+    -stop-date)
+      index=$((index + 1))
+      STOP_DATE="${*:index:1}"
+      ;;
+    -stop-date=*)
+      STOP_DATE="${arg#*=}"
+      ;;
+    -host)
+      index=$((index + 1))
+      HOST="${*:index:1}"
+      ;;
+    -host=*)
+      HOST="${arg#*=}"
+      ;;
+    -random)
+      IS_RANDOM='true'
+      ;;
+    -skip-datax)
+      SKIP_DATAX='true'
+      ;;
+    -h | -H | --h | --H | --help)
+      usage 0
+      ;;
+    *) ;;
+    esac
+  done
+  pretty_print "${NORM_MGT}${0} 收到参数:${NORM_GRN}${*}"
+}
+
+function prepare() {
+  if [ -z "${JOB_CONFIG_PATH}" ] && [ -z "${GENERATOR_CONFIG_PATH}" ]; then
+    pretty_print "${NORM_RED}请使用 -gc 提供DataX作业配置生成器配置文件(.ini文件)或使用 -c 提供DataX作业配置文件(.json文件)"
+    usage 1
+  fi
+  if [ "$(uname)" = "Linux" ]; then
+    YESTERDAY=$(date -d '-1 day' +%Y%m%d)
+    TODAY=$(date +%Y%m%d)
+  else
+    YESTERDAY=$(date -v-1d +%Y%m%d)
+    TODAY=$(date +%Y%m%d)
+  fi
+  if [ -z "${START_DATE}" ]; then
+    START_DATE=${YESTERDAY}
+  fi
+  if [ -z "${STOP_DATE}" ]; then
+    STOP_DATE=${TODAY}
+  fi
+  if [ -n "${JOB_CONFIG_PATH}" ]; then
+    JOB_CONFIG_RELATIVE_PATH=${JOB_CONFIG_PATH#"${BASE_DIR}/"}
+    pretty_print "${NORM_MGT}使用配置文件 ${NORM_GRN}${JOB_CONFIG_RELATIVE_PATH} ${NORM_MGT}运行DataX作业"
+  fi
+  if [ -n "${GENERATOR_CONFIG_PATH}" ]; then
+    GENERATOR_CONFIG_RELATIVE_PATH=${GENERATOR_CONFIG_PATH#"${BASE_DIR}/"}
+    pretty_print "${NORM_MGT}使用DataX作业配置生成器配置文件 ${NORM_GRN}${GENERATOR_CONFIG_RELATIVE_PATH} ${NORM_MGT}运行DataX作业"
+  fi
+}
+
+function run_single_datax_job() {
+  generate_job_config
+  if [[ "${JOB_CONFIG_PATH}" =~ .*/hdfs-kafka/.* ]]; then
+    pretty_print "${NORM_GRN}写kafka的ini文件中,writer部分的注意事项:"
+    pretty_print "${NORM_GRN}1. column,与reader的column在顺序上应一一对应,可在此处实现列名的变换"
+    pretty_print "${NORM_GRN}2. columnType,支持的显式类型(类型书写不区分大小写,结构体子字段名称区分大小写)有:"
+    pretty_print "${NORM_GRN}(1)id,表示该字段的值是es document的id"
+    pretty_print "${NORM_GRN}(2)array<string>,如果字段在hive中的定义是array<string>,但其element是数值(包括0开头的字符串数值)、bool的,应显式指定其类型;其他element类型本就是string或者数值、bool,则可以不指明"
+    pretty_print "${NORM_GRN}(3)array<struct<?,?,?>>,如果字段在hive中的定义是array<struct<?,?,?>>,应当显式指定,其结构体属性的名称和类型,名称和类型的分隔符为@,属性间的分隔符为#,例如:array<struct<col1@bigint#col2@string>>,如果字段是已经序列化的命名结构体数组,则不需要指明类型"
+    pretty_print "${NORM_GRN}(4)struct<?,?,?>,如果字段在hive中的定义是struct<?,?,?>,应当显式指定,其结构体属性的名称和类型,名称和类型的分隔符为@,例如:struct<col1@bigint#col2@string>,如果字段是已经序列化的命名结构体,则不需要指明类型"
+    pretty_print "${NORM_GRN}(5)json,如果字段是序列化后的json,应指明需要进行反序列化"
+    pretty_print "${NORM_GRN}(6)其他类型,比如int、long、bigint、bool、double等,都不需要指明类型"
+    pretty_print "${NORM_GRN}3. columnMapping,同hive es mapping表tblproperties['es.mapping.names']"
+    pretty_print "${NORM_GRN}注:不管columnType还是结构体属性字段名称,都会使用columnMapping进行字段名称转换"
+    pretty_print "${NORM_RED}重要:array的element、struct的字段值,一定不要包含英文逗号,否则会出现切分的值不对的情况!!!"
+  fi
+  if [ "${selected_worker}" = "${CURRENT_HOST}" ]; then
+    if [ -z "${SKIP_DATAX}" ]; then
+      check_data_exists
+      if [ "${DATA_EXISTS}" = 'true' ]; then
+        pretty_print "${NORM_MGT}在 ${NORM_GRN}${selected_worker} ${NORM_MGT}上执行以下命令:
+                    ${NORM_GRN}${datax_run_command}"
+        echo -en "${NORM_GRN}"
+        # 本机执行
+        ${datax_run_command}
+      fi
+    else
+      pretty_print "${NORM_MGT}DataX job was set to skipped"
+    fi
+  else
+    if [ -z "${SKIP_DATAX}" ]; then
+      check_data_exists
+      if [ "${DATA_EXISTS}" = 'true' ]; then
+        pretty_print "${NORM_MGT}在 ${NORM_GRN}${CURRENT_HOST} ${NORM_MGT}上执行以下命令:
+                    ${NORM_GRN}ssh ${RELEASE_USER}@${selected_worker} ${datax_run_command}"
+        echo -en "${NORM_GRN}"
+        # shellcheck disable=SC2029
+        ssh "${selected_worker}" "${datax_run_command}"
+      fi
+    else
+      pretty_print "${NORM_MGT}DataX job was set to skipped"
+    fi
+  fi
+}
+parse_args "${@}"
+prepare
+select_worker
+run_single_datax_job
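
Invocation sketches based on usage() above; the worker host name and generated .json path are placeholders:

    # Generate the job config from an .ini and run it on a randomly selected worker
    bin/datax-single-job-starter.sh -gc conf/datax/config/mysql-hdfs/bms_ods_test/mysql-hdfs-ik_bms_test-activity_labels.ini -random
    # Run a pre-generated .json on a named host for an explicit date window
    bin/datax-single-job-starter.sh -c conf/datax/generated/mysql-hdfs/bms_ods_test/mysql-hdfs-ik_bms_test-activity_labels.json -host worker01 -start-date 20211101 -stop-date 20211102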

+ 83 - 0
bin/dingtalk-work-alert.sh

@@ -0,0 +1,83 @@
+#!/bin/bash
+#--------------------------------------------------------------------------------------------------
+# 使用钉钉群机器人发送告警的脚本
+# 腾讯云数仓机器人:-key=19e30ec1-d001-4437-ac41-63dc07f78520
+# 小可爱:-key=cc3653b1-78cb-465a-bf95-bf5f5303a37a
+#--------------------------------------------------------------------------------------------------
+BASE_DIR=$(
+  cd "$(dirname "$(realpath "$0")")/.." || exit
+  pwd
+)
+. "${BASE_DIR}"/bin/common/init.sh
+function usage() {
+  echo -e "${NORM_MGT}Usage: $0
+  ${NORM_CYN}\t[-h/-H/--h/--H/--help]             打印脚本使用方法${DO_RESET}"
+  echo -e "${NORM_MGT}Usage: $0
+  ${NORM_GRN}\t<-key[ /=] robot hook key>         机器人url后的key(lxy/common/alerter_constants.py中有记录)
+  ${NORM_GRN}\t<-msg[=/] message need to send>    要发送的消息
+  ${NORM_GRN}\t<-f[=/] file message need to send> 要发送的文件消息
+  ${DO_RESET}"
+  exit "$1"
+}
+
+function parse_args() {
+  for index in $(seq 1 $#); do
+    arg=${*:index:1}
+    case $arg in
+    -key)
+      index=$((index + 1))
+      KEY="${*:index:1}"
+      ;;
+    -key=*)
+      KEY="${arg#*=}"
+      ;;
+    -msg)
+      index=$((index + 1))
+      MSG+=("${*:index:1}")
+      ;;
+    -msg=*)
+      MSG+=("${arg#*=}")
+      ;;
+    -f)
+      index=$((index + 1))
+      FILE_PATH+=("${*:index:1}")
+      ;;
+    -f=*)
+      FILE_PATH+=("${arg#*=}")
+      ;;
+    -h | -H | --help)
+      usage 0
+      ;;
+    *) ;;
+    esac
+  done
+}
+
+function build_message() {
+  if [ -z "${KEY}" ] || [ "${#MSG[@]}" -eq 0 ]; then
+    usage 1
+  fi
+  msg=${MSG[0]}
+  for ((i = 1; i < ${#MSG[@]}; i++)); do
+    msg="${msg}\n${MSG[$i]}"
+  done
+  message=("{
+	  \"msgtype\": \"text\",
+	  \"text\": {
+		  \"content\": \"异常告警:\n${msg[*]}\"
+	  },
+	  \"at\":{
+	    \"isAtAll\":true
+	  }
+  }")
+  url="http://m1.node.cdh/dingtalk/api/robot/send?access_token=${KEY}"
+}
+
+# shellcheck disable=SC2034
+AT=()
+MSG=()
+parse_args "${@}"
+build_message
+
+#echo -e "${NORM_GRN} Send message using ${RED}${url}${DO_RESET}"
+curl "$url" -H 'Content-Type: application/json' -d "${message[*]}"

+ 122 - 0
bin/excel_to_hive.py

@@ -0,0 +1,122 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+"""
+  -- 设置 SparkSession 名称(血缘分析)
+  SET spark.app.name=excel_to_hive;
+  -- 设置 Spark 配置
+  SET spark.xxx.yyy.zzz=xyz;
+  -- 引用 UDF
+  ADD FILE dw_base/spark/udf/spark_xxx_udf.py;
+  -- 声明变量
+  SET TOPIC=xxx;
+  -- 查看数据行数
+  SET LIMIT=1000;
+"""
+import json
+import os
+import sys
+import argparse
+import re
+import pandas as pd
+from pyspark.sql import SparkSession
+
+base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(base_dir)
+
+
+class Excel2HiveUtil:
+    def __init__(self):
+        self.base_dir = base_dir
+
+    def run(self, excel_position: str, db_table: str, topic: str, dt: str, skip_rows=0):
+        spark = SparkSession.builder \
+            .appName("ExcelToHive") \
+            .master("yarn") \
+            .config('hive.exec.orc.default.block.size', 134217728) \
+            .config('spark.debug.maxToStringFields', 5000) \
+            .config('spark.dynamicAllocation.enabled', False) \
+            .config('spark.files.ignoreCorruptFiles', True) \
+            .config('spark.sql.adaptive.enabled', 'true') \
+            .config('spark.sql.broadcastTimeout', -1) \
+            .config('spark.sql.codegen.wholeStage', 'false') \
+            .config('spark.sql.execution.arrow.enabled', True) \
+            .config('spark.sql.execution.arrow.fallback.enabled', True) \
+            .config('spark.sql.files.ignoreCorruptFiles', True) \
+            .config('spark.sql.statistics.fallBackToHdfs', True) \
+            .config('hive.exec.dynamic.partition.mode', 'nonstrict') \
+            .config('spark.yarn.queue', "default") \
+            .enableHiveSupport().getOrCreate()
+
+        if not excel_position.startswith('/'):
+            full_path = os.path.join(self.base_dir, excel_position)
+        else:
+            full_path = excel_position
+
+        # 读取 Excel 数据
+        excel_data = pd.read_excel(full_path, dtype=str, skiprows=skip_rows)
+        excel_data.fillna(value='', inplace=True)
+
+        json_data = excel_data.apply(
+            lambda row: json.dumps({f'col{i + 1}': str(re.sub(r'[\n\t]', '', val)) for i, val in enumerate(row)}),
+            axis=1)
+
+        pandas_df = pd.DataFrame({'ori_json': json_data})
+        pandas_df['dt'] = dt
+        pandas_df['topic'] = topic
+
+        spark_df = spark.createDataFrame(pandas_df)
+
+        # # Write data to Hive table with partitions
+        # spark_df.write.mode('overwrite').partitionBy("dt", "topic").saveAsTable(db_table)
+
+        spark_df.createOrReplaceTempView("temp_view")
+        spark.sql(
+            f"""
+                    INSERT OVERWRITE TABLE {db_table} PARTITION (dt='{dt}', topic='{topic}')
+                    SELECT ori_json FROM temp_view
+                """)
+
+        spark.stop()
+
+
+def usage():
+    print(
+        f'Usage: {sys.argv[0]}\n'
+        f'  [-h/--help]                     打印脚本使用方法\n'
+        f'  [-p/--path <excel_file_path>]   要转换的 Excel 文件路径(必填)\n'
+        f'  [-t/--topic <topic>]            要插入到 Hive 表中的 topic(必填)\n'
+        f'  [-s/--skip <skip_rows>]         要跳过的 Excel 文件的行数(可选,默认为 0)\n'
+        f'  [-d/--dt <date>]                要插入到 Hive 表中的日期(可选,默认为 19700101)\n'
+    )
+    exit(1)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Excel to Hive')
+    parser.add_argument('-p', '--path', type=str, required=True, help='Path to the Excel file')
+    parser.add_argument('-t', '--topic', type=str, required=True, help='Topic name to insert into Hive table')
+    parser.add_argument('-s', '--skip', type=int, default=0, help='Number of rows to skip in the Excel file')
+    parser.add_argument('-d', '--dt', type=str, default='19700101',
+                        help='Date to insert into Hive table, default is 19700101')
+    args = parser.parse_args()
+
+    config = {
+        'path': args.path,
+        'topic': args.topic,
+        'skip': args.skip,
+        'dt': args.dt
+    }
+    return config
+
+
+if __name__ == '__main__':
+    config = parse_args()
+
+    excel_position = config['path']
+    db_table = 'ent_raw.manual_import_data'
+    topic = config['topic']
+    skip_rows = config['skip']
+    dt = config['dt']
+
+    Excel2HiveUtil().run(excel_position, db_table, topic, dt, skip_rows)
+    print("================= Transfer completed! =======================")

+ 194 - 0
bin/flume-control.sh

@@ -0,0 +1,194 @@
+#!/bin/bash
+#--------------------------------------------------------------------------------------------------
+# 启动单个flume-kafka-hdfs作业
+# 1. 要求配置文件`conf/flume/kafka-hdfs-${  }.properties`必须存在
+# 2. 可在console上查看到作业是否启动成功
+# 3. 可通过查看日志`${LOG_ROOT_DIR}/flume-agent/${TODAY}/${JOB_NAME}.log`来确定作业运行情况
+#--------------------------------------------------------------------------------------------------
+BASE_DIR=$(
+  cd "$(dirname "$(realpath "$0")")/.." || exit
+  pwd
+)
+. "${BASE_DIR}"/bin/common/init.sh
+LOG_ROOT_DIR="/opt/data/log"
+
+function usage() {
+  echo -e "${NORM_MGT}Usage: $0
+  ${NORM_CYN}\t[-h/-H/--h/--H/--help]                                                 打印脚本使用方法${DO_RESET}"
+  echo -e "${NORM_MGT}Usage: $0
+  ${NORM_GRN}\t<log|monitor|start|start-all|status|stop|stop-all|restart|restart-all> 程序操作:log、monitor、start、start-all、status、stop、stop-all、restart、restart-all
+  ${NORM_GRN}\t<config name>                                                          配置名称,配置文件名称要求为kafka-hdfs-<config>.properties
+  ${DO_RESET}"
+  exit "$1"
+}
+
+function log() {
+  status
+  if [ -z "${current_log_file}" ]; then
+    pretty_print "${NORM_RED}未找到任何有效的日志文件"
+    exit 1
+  else
+    tail -100f "${current_log_file}"
+  fi
+}
+
+function start() {
+  if [ "$(uname)" = "Linux" ]; then
+    TODAY=$(date +%Y%m%d)
+  else
+    TODAY=$(date +%Y%m%d)
+  fi
+  LOG_DIR="${LOG_ROOT_DIR}/flume-agent/${TODAY}"
+  LOG_FILE_PATH="${LOG_DIR}/${CONFIG_NAME}.log"
+  if [ ! -d "${LOG_DIR}" ]; then
+    mkdir -p "${LOG_DIR}"
+    pretty_print "${NORM_MGT}创建日志目录 ${NORM_GRN}${LOG_DIR}"
+  fi
+  count=$(ps -axo command | grep "${JOB_CONFIG_FILE_NAME}" | grep -v grep | wc -l)
+  if [ "${count}" -gt 0 ]; then
+    pretty_print "${NORM_RED}使用配置文件 ${NORM_GRN}${JOB_CONFIG_FILE_NAME} ${NORM_RED}的Flume作业已在运行中"
+  else
+    pretty_print "${NORM_MGT}使用作业名称 ${NORM_GRN}${CONFIG_NAME} ${NORM_MGT}启动Flume作业"
+    flume-ng agent \
+      -Xms256m -Xmx4g \
+      --conf /etc/flume-ng/conf/ \
+      --conf-file "${JOB_CONFIG_FILE}" \
+      --name a1 \
+      -Dflume.root.logger=INFO,console >>"${LOG_FILE_PATH}" 2>&1 &
+    FLUME_APPLICATION_PID=$!
+    pretty_print "${NORM_MGT}Flume作业已启动,pid为 ${NORM_GRN}${FLUME_APPLICATION_PID}${NORM_MGT},日志文件为 ${NORM_GRN}${LOG_FILE_PATH}"
+  fi
+}
+
+function start-all() {
+  for JOB_CONFIG_FILE in "${BASE_DIR}"/conf/flume/*.properties; do
+    JOB_CONFIG_FILE_NAME=$(basename "${JOB_CONFIG_FILE}")
+    CONFIG_FULL_NAME=$(basename "${JOB_CONFIG_FILE_NAME}" .properties)
+    CONFIG_NAME=$(echo "${CONFIG_FULL_NAME}" | sed "s/kafka-hdfs-//g")
+    start
+  done
+}
+
+function status() {
+  agent_pid=$(ps -axo pid,command | grep "${JOB_CONFIG_FILE_NAME}" | grep -v grep | awk -F ' ' '{print $1}')
+  if [ -n "${agent_pid}" ]; then
+    mapfile -t log_files < <(find "${LOG_ROOT_DIR}"/flume-agent -name "*${CONFIG_NAME}.log" | sort -r)
+    if [ "${#log_files[@]}" -gt 0 ]; then
+      current_log_file="${log_files[0]}"
+    fi
+    pretty_print "${NORM_MGT}Flume agent ${NORM_GRN}${CONFIG_NAME} ${NORM_MGT}is running at pid ${NORM_GRN}${agent_pid}"
+  else
+    pretty_print "${NORM_MGT}Flume agent ${NORM_GRN}${CONFIG_NAME} ${NORM_MGT}is not running"
+  fi
+}
+
+function stop() {
+  agent_pid=$(ps -axo pid,command | grep "${JOB_CONFIG_FILE_NAME}" | grep -v grep | awk -F ' ' '{print $1}')
+  if [ -z "${agent_pid}" ]; then
+    pretty_print "${NORM_MGT}Flume作业 ${NORM_GRN}${CONFIG_NAME} ${NORM_MGT}并未运行"
+    return
+  else
+    pretty_print "${NORM_MGT}停止Flume作业 ${NORM_GRN}${CONFIG_NAME}(${agent_pid})"
+    kill -15 "${agent_pid}"
+  fi
+  agent_pid=$(ps -axo pid,command | grep "${JOB_CONFIG_FILE_NAME}" | grep -v grep | awk -F ' ' '{print $1}')
+  if [ -z "${agent_pid}" ]; then
+    pretty_print "${NORM_MGT}Flume作业 ${NORM_GRN}${CONFIG_NAME} ${NORM_MGT}已停止"
+  else
+    pretty_print "${NORM_MGT}Flume作业 ${NORM_GRN}${CONFIG_NAME} ${NORM_MGT}停止失败"
+  fi
+}
+
+function stop-all() {
+  for JOB_CONFIG_FILE in "${BASE_DIR}"/conf/flume/*.properties; do
+    JOB_CONFIG_FILE_NAME=$(basename "${JOB_CONFIG_FILE}")
+    CONFIG_FULL_NAME=$(basename "${JOB_CONFIG_FILE_NAME}" .properties)
+    CONFIG_NAME=$(echo "${CONFIG_FULL_NAME}" | sed "s/kafka-hdfs-//g")
+    stop
+  done
+}
+
+function monitor() {
+  while true; do
+    agent_pid=""
+    status
+    if [ -z "${agent_pid}" ]; then
+      "${BASE_DIR}"/bin/wechat-work-alert.sh \
+        -key="${SKB_LITTLE_CUTE}" \
+        -at=13917467529 \
+        -msg="$(date +'%Y-%m-%d %H:%M:%S') Flume agent (${CONFIG_NAME}) is not running"
+    else
+      pretty_print "${NORM_MGT}Monitor Flume agent by read log file ${NORM_GRN}${current_log_file}${NORM_MGT}"
+      if head -n 1000 "${current_log_file}" | grep -E "gz failed|java.io.IOException|org.apache.flume.ChannelException|java.lang.IllegalStateException"; then
+        "${BASE_DIR}"/bin/wechat-work-alert.sh \
+          -key="${SKB_LITTLE_CUTE}" \
+          -at=13917467529 \
+          -msg="$(date +'%Y-%m-%d %H:%M:%S') Flume agent (${CONFIG_NAME}) may not be running properly, please check log file ${current_log_file} to see what happened"
+      else
+        pretty_print "${NORM_MGT}Flume agent ${NORM_GRN}${CONFIG_NAME} ${NORM_MGT}is running properly"
+      fi
+    fi
+    if [ "$(date +%H)" = "00" ]; then
+      break
+    fi
+    pretty_print "${NORM_MGT}Waiting ${NORM_GRN}3600 ${NORM_MGT}seconds for the next check"
+    sleep 3600s
+  done
+}
+
+function pretty_print() {
+    # 设置文本颜色和格式
+    NORM_RED='\033[0;31m'  # 红色
+    NORM_GRN='\033[0;32m'  # 绿色
+    NORM_CYN='\033[0;36m'  # 青色
+    NORM_MGT='\033[0m'   # 重置颜色和格式
+    # 打印带颜色和格式的消息
+    echo -e "${1}"
+}
+
+function run() {
+  op="${1}"
+  if [ -z "${op}" ]; then
+    usage 1
+  fi
+  case ${op} in
+  log | monitor | start | status | stop | restart)
+    CONFIG_NAME="${2}"
+    pretty_print "${NORM_MGT}${0} 收到参数:${NORM_GRN}${*}"
+    if [ -z "${CONFIG_NAME}" ]; then
+      usage 1
+    fi
+    JOB_CONFIG_FILE_NAME="kafka-hdfs-${CONFIG_NAME}.properties"
+    JOB_CONFIG_FILE="${BASE_DIR}/conf/flume/config/${JOB_CONFIG_FILE_NAME}"
+    if [ ! -f "${JOB_CONFIG_FILE}" ]; then
+      pretty_print "${NORM_RED}Flume作业配置文件 ${NORM_GRN}${JOB_CONFIG_FILE} ${NORM_RED}不存在"
+      exit 1
+    fi
+    ;;
+  start-all | stop-all | restart-all) ;;
+  -h | -H | --h | --H | --help) usage 0 ;;
+  *)
+    pretty_print "${NORM_RED}Unsupported operation ${NORM_GRN}${op}"
+    usage 1
+    ;;
+  esac
+  case $op in
+  log) log ;;
+  monitor) monitor ;;
+  start) start ;;
+  start-all) start-all ;;
+  status) status ;;
+  stop) stop ;;
+  stop-all) stop-all ;;
+  restart)
+    stop
+    start
+    ;;
+  restart-all)
+    stop-all
+    start-all
+    ;;
+  esac
+}
+
+run "${@}"

+ 13 - 0
bin/hive-exec-job-starter.py

@@ -0,0 +1,13 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+"""
+  Note:为方便本地调试设计,请勿在调度中使用
+"""
+import os
+import sys
+
+project_root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(project_root_dir)
+
+if __name__ == '__main__':
+    os.system(f'{project_root_dir}/bin/hive-exec.sh {" ".join(sys.argv[1:])}')

+ 186 - 0
bin/hive-exec.sh

@@ -0,0 +1,186 @@
+#!/bin/bash
+#--------------------------------------------------------------------------------------------------
+# 执行HQL语句(-e)或HQL文件(-f),按-dt传入的日期(或日期范围)逐日设置hivevar dt后执行
+#--------------------------------------------------------------------------------------------------
+set -e
+BASE_DIR=$(
+  cd "$(dirname "$(realpath "$0")")/.." || exit
+  pwd
+)
+. "${BASE_DIR}"/bin/common/init.sh
+function usage() {
+  echo -e "${NORM_MGT}Usage: $0
+  ${NORM_GRN}\t<-e[ /=]HQL语句>   HQL语句,需要使用''包裹SQL语句
+  ${NORM_CYN}\t[-dt[ /=]日期]     %Y%m%d 或 yyyyMMdd 格式的日期(命令行 > 默认)
+  ${NORM_CYN}\t                  可以以四种形式传入日期:
+  ${NORM_CYN}\t                      1. 20211101,表示具体日期
+  ${NORM_CYN}\t                      2. 20211101-,表示20211101至昨天
+  ${NORM_CYN}\t                      3. 20211101-20211107,表示20211101至20211107
+  ${NORM_CYN}\t                      4. 20211101,20211103,表示离散的日期20211101、20211103
+  ${NORM_CYN}\t[-c 参数名:参数值]  Hive参数
+  ${NORM_CYN}\t[-v 变量名:变量值]  Hive变量
+  ${DO_RESET}"
+  echo -e "${NORM_MGT}Usage: $0
+  ${NORM_GRN}\t<-f[ /=]HQL文件>   HQL文件
+  ${NORM_CYN}\t[-dt[ /=]日期]     %Y%m%d 或 yyyyMMdd 格式的日期(命令行 > 默认)
+  ${NORM_CYN}\t                  可以以四种形式传入日期:
+  ${NORM_CYN}\t                      1. 20211101,表示具体日期
+  ${NORM_CYN}\t                      2. 20211101-,表示20211101至昨天
+  ${NORM_CYN}\t                      3. 20211101-20211107,表示20211101至20211107
+  ${NORM_CYN}\t                      4. 20211101,20211103,表示离散的日期20211101、20211103
+  ${NORM_CYN}\t[-c 参数名:参数值]  Hive参数
+  ${NORM_CYN}\t[-v 变量名:变量值]  Hive变量
+  ${DO_RESET}"
+  exit "$1"
+}
+
+function parse_args() {
+  for index in $(seq 1 $#); do
+    arg=${*:index:1}
+    case $arg in
+    -c)
+      index=$((index + 1))
+      HIVE_CONF+=("--hiveconf")
+      HIVE_CONF+=("${*:index:1}")
+      ;;
+    -c=*)
+      HIVE_CONF+=("--hiveconf")
+      HIVE_CONF+=("${arg#*=}")
+      ;;
+    -dt)
+      index=$((index + 1))
+      if [ -z "${DT}" ]; then
+        DT="${*:index:1}"
+      fi
+      ;;
+    -dt=*)
+      if [ -z "${DT}" ]; then
+        DT="${arg#*=}"
+      fi
+      ;;
+    -v)
+      index=$((index + 1))
+      # 例如:dt=20220101、dt:20220101
+      KEY_VALUE="${*:index:1}"
+      # 截取 dt
+      KEY="${KEY_VALUE%%[:|=]*}"
+      # 截取 20220101
+      VALUE="${KEY_VALUE#*[:|=]}"
+      if [ "${KEY}" = "dt" ]; then
+        if [ -z "${DT}" ]; then
+          DT="${VALUE}"
+        fi
+      else
+        HIVE_GLOBAL_VAR+=("--hivevar")
+        HIVE_GLOBAL_VAR+=("${KEY_VALUE}")
+      fi
+      ;;
+    -v=*)
+      KEY_VALUE="${arg#*=}"
+      KEY="${KEY_VALUE%%[:|=]*}"
+      VALUE="${KEY_VALUE#*[:|=]}"
+      if [ "${KEY}" = "dt" ]; then
+        if [ -z "${DT}" ]; then
+          DT="${VALUE}"
+        fi
+      else
+        HIVE_GLOBAL_VAR+=("--hivevar")
+        HIVE_GLOBAL_VAR+=("${KEY_VALUE}")
+      fi
+      ;;
+    -e)
+      index=$((index + 1))
+      HIVE_SQL="${*:index:1}"
+      ;;
+    -e=*)
+      HIVE_SQL="${arg#*=}"
+      ;;
+    -f)
+      index=$((index + 1))
+      HIVE_FILE="${*:index:1}"
+      ;;
+    -f=*)
+      HIVE_FILE="${arg#*=}"
+      ;;
+    -h | -H | --h | --H | --help)
+      usage 0
+      ;;
+    *) ;;
+
+    esac
+  done
+  pretty_print "${NORM_MGT}${0} 收到参数:${NORM_GRN}${*}"
+}
+
+function run_execute() {
+  if [ -n "${HIVE_SQL}" ]; then
+    pretty_print "${NORM_MGT}执行Shell命令 ${NORM_GRN}hive -e ${HIVE_SQL} ${HIVE_CONF[*]} ${HIVE_LOCAL_VAR[*]}"
+    # 执行HQL语句
+    hive -e "${HIVE_SQL}" "${HIVE_CONF[@]}" "${HIVE_LOCAL_VAR[@]}" 2>&1 | tee -a "${LOG_FULL_PATH}"
+    exit "${PIPESTATUS[0]}"
+  elif [ -n "${HIVE_FILE}" ]; then
+    # 执行HQL文件
+    pretty_print "${NORM_MGT}执行Shell命令 ${NORM_GRN}hive -f ${HIVE_FILE} ${HIVE_CONF[*]} ${HIVE_LOCAL_VAR[*]}"
+    if [ "${USER}" == "${RELEASE_USER}" ]; then
+      hive -f "/home/${USER}/release/tendata-warehouse/${HIVE_FILE}" "${HIVE_CONF[@]}" "${HIVE_LOCAL_VAR[@]}" 2>&1 | tee -a "${LOG_FULL_PATH}"
+    else
+      hive -f "/home/${USER}/tendata-warehouse/${HIVE_FILE}" "${HIVE_CONF[@]}" "${HIVE_LOCAL_VAR[@]}" 2>&1 | tee -a "${LOG_FULL_PATH}"
+    fi
+    EXIT_CODE="${PIPESTATUS[0]}"
+    if [ "${EXIT_CODE}" -ne 0 ]; then
+      if [[ "${HIVE_FILE}" =~ .*stg_es_mapping.sql ]]; then
+        exit $((EXIT_CODE))
+      fi
+      if [[ "${HIVE_FILE}" =~ .*stage_es_mapping.sql ]]; then
+        exit $((EXIT_CODE))
+      fi
+      # RELEASE_USER="dev005"
+      if [ "${USER}" == "${RELEASE_USER}" ]; then
+        DINGTALK_ALERT_KEY="4eb576296e66f49628447c8f2931c8892583f3283c96fef872577148aa5f88fa"
+        MESSAGE="在 ${CURRENT_HOST} 上执行HQL文件 /home/${USER}/release/tendata-warehouse/${HIVE_FILE} 失败"
+        "${BASE_DIR}"/bin/dingtalk-work-alert.sh -key="${DINGTALK_ALERT_KEY}" -msg="${MESSAGE}"
+      else
+        pretty_print "${NORM_MGT}执行HQL文件 ${NORM_GRN}${HIVE_FILE}${NORM_MGT} 失败"
+      fi
+      exit $((EXIT_CODE))
+    fi
+  else
+    usage 1
+  fi
+}
+
+function pretty_print() {
+    # 设置文本颜色和格式
+    NORM_GRN='\033[0;32m'  # 绿色
+    NORM_CYN='\033[0;36m'  # 青色
+    NORM_MGT='\033[0m'   # 重置颜色和格式
+    # 打印带颜色和格式的消息
+    echo -e "${1}"
+}
+
+
+HIVE_CONF=()
+HIVE_GLOBAL_VAR=()
+HIVE_SQL=""
+HIVE_FILE=""
+parse_args "${@}"
+if [ -z "${DT}" ]; then
+  DT=$(date -d '-1 day' +%Y%m%d)
+fi
+date_range "${DT}"
+for DT in "${DATE_RANGE[@]}"; do
+  HIVE_LOCAL_VAR=("${HIVE_GLOBAL_VAR[@]}")
+  HIVE_LOCAL_VAR+=("--hivevar")
+  HIVE_LOCAL_VAR+=("dt=${DT}")
+  LOG_DIR="${LOG_ROOT_DIR}/hive-exec/${DT}"
+  if [ -n "${HIVE_SQL}" ]; then
+    HIVE_FILE_SIMPLE_NAME=$(echo "${HIVE_SQL}" | base64)
+    LOG_FILE_NAME="${HIVE_FILE_SIMPLE_NAME}.log"
+  elif [ -n "${HIVE_FILE}" ]; then
+    HIVE_FILE_SIMPLE_NAME=$(basename "${HIVE_FILE}" .sql)
+    LOG_FILE_NAME="${HIVE_FILE_SIMPLE_NAME}.log"
+  fi
+  mkdir -p "${LOG_DIR}"
+  LOG_FULL_PATH="${LOG_DIR}/${LOG_FILE_NAME}"
+  pretty_print "${NORM_MGT}日志文件将写入 ${NORM_GRN}${LOG_FULL_PATH}${NORM_MGT}"
+  run_execute
+done
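
Usage sketches matching the two forms documented in usage(); file, table, and variable names are placeholders:

    # Run an HQL file once per day over a date range, with an extra hivevar and hiveconf
    bin/hive-exec.sh -f sql/demo/dwd_demo.sql -dt 20211101-20211107 -v TOPIC:xxx -c hive.exec.parallel:true
    # Inline statement for a single day (defaults to yesterday when -dt is omitted)
    bin/hive-exec.sh -e 'SELECT count(*) FROM demo_db.demo_table WHERE dt=${dt}'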

+ 191 - 0
bin/spark-sql-starter.py

@@ -0,0 +1,191 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+"""
+  -- 设置 SparkSession 名称(血缘分析)
+  SET spark.app.name=crl_dwd.dwd_crl_xxx;
+  -- 设置 Spark 配置
+  SET spark.xxx.yyy.zzz=xyz;
+  -- 引用 UDF
+  ADD FILE dw_base/spark/udf/spark_xxx_udf.py;
+  -- 声明变量
+  SET DT_START=20210101;
+  SET TOPIC=xxx;
+  SET KEY=CourtAnn;
+  SET ODS_TABLE=crl_ods.ods_crl_xxx;
+  -- 查看数据行数
+  SET LIMIT=1000;
+"""
+import os
+import sys
+from typing import Dict, List
+
+base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(base_dir)
+from dw_base import *
+from dw_base.common.config_constants import K_DT, KL_HELP
+from dw_base.common.container import ValueContainer
+from dw_base.spark.spark_sql import SparkSQL
+from dw_base.spark.spark_utils import analyse_session_name
+from dw_base.utils.common_utils import exist
+from dw_base.utils.config_utils import parse_args
+from dw_base.utils.datetime_utils import get_yesterday, get_date_range
+from dw_base.utils.file_utils import list_files
+from dw_base.utils.log_utils import pretty_print, CURRENT_LOG_FILE, get_log_file_path
+
+
+def usage(code: int):
+    print(
+        f'{NORM_MGT}Usage: {sys.argv[0]}\n'
+        f'{NORM_CYN}\t[-h/-H/--h/--H/--help]         打印脚本使用方法{DO_RESET}'
+    )
+    print(
+        f'{NORM_MGT}Usage: {sys.argv[0]}\n'
+        f'{NORM_GRN}\t<[-]-f< /=>sql file>           要执行的SQL文件,可传入多个,按传入顺序执行,与参数-d互斥\n'
+        f'{NORM_GRN}\t<[-]-d< /=>sql file directory> 要执行的SQL文件夹,可传入多个,按传入顺序执行\n'
+        f'{NORM_CYN}\t[[-]-u< /=>udf file]           被引用到的udf文件(.py),注意多个udf文件内定义的方法有重名时,只会取靠前的\n'
+        f'{NORM_CYN}\t[[-]-sc spark-config]          Spark自定义配置,格式为config:config-value或config=config-value\n'
+        f'{NORM_CYN}\t[[-]-dt< /=>date with format]  %Y%m%d 或 yyyyMMdd 格式的日期(命令行 > 默认)\n'
+        f'{NORM_CYN}\t                               可以以四种形式传入日期:\n'
+        f'{NORM_CYN}\t                                   1. 20211101,表示具体日期\n'
+        f'{NORM_CYN}\t                                   2. 20211101-,表示20211101至昨天\n'
+        f'{NORM_CYN}\t                                   3. 20211101-20211107,表示20211101至20211107\n'
+        f'{NORM_CYN}\t                                   4. 20211101,20211103,表示离散的日期20211101、20211103\n'
+        f'{NORM_CYN}\t[[-]-p sql-parameter]          SQL脚本的参数,格式为parameter:parameter-value或parameter=parameter-value\n'
+        f'{NORM_CYN}\t[[-]-n]                        在串行执行多个SQL文件时,一个失败并不影响后续文件的执行,但最后程序会非正常结束,即返回非0值(失败的文件个数)\n'
+        f'{NORM_CYN}\t[[-]-0]                        在串行执行多个SQL文件时,一个失败并不影响后续文件的执行,且最后程序正常结束(请谨慎使用)\n'
+        f'{NORM_CYN}\t                               上述两个参数都不传的情况下,某个SQL文件执行失败时,后续文件都不再执行'
+        f'{DO_RESET}'
+    )
+    exit(code)
+
+
+def parse_cli_spark_config() -> Dict[str, str]:
+    cli_spark_config = {}
+    spark_configs = CONFIG.get('sc', [])
+    if isinstance(spark_configs, str):
+        spark_configs = [spark_configs]
+    for elem in spark_configs:
+        if '=' in elem:
+            spark_config, config_value = elem.split('=', 1)
+        elif ':' in elem:
+            spark_config, config_value = elem.split(':', 1)
+        else:
+            pretty_print(f'{NORM_YEL}无效的 Spark 配置 {NORM_GRN}{elem}')
+            continue
+        if spark_config in cli_spark_config:
+            pretty_print(f'{NORM_YEL}命令行多次传入了 Spark 配置 {NORM_GRN}{spark_config}'
+                         f'{NORM_YEL}, 原值 {NORM_GRN}{cli_spark_config[spark_config]} '
+                         f'{NORM_YEL}将被覆盖为新值 {NORM_GRN}{config_value}')
+        cli_spark_config[spark_config] = config_value
+    return cli_spark_config
+
+
+def parse_sql_parameters() -> Dict[str, str]:
+    sql_parameters = {}
+    parameter_configs = CONFIG.get('p', [])
+    if isinstance(parameter_configs, str):
+        parameter_configs = [parameter_configs]
+    for elem in parameter_configs:
+        if '=' in elem:
+            parameter_key, parameter_value = elem.split('=', 1)
+        elif ':' in elem:
+            parameter_key, parameter_value = elem.split(':', 1)
+        else:
+            pretty_print(f'{NORM_YEL}无效的 SQL 参数 {NORM_GRN}{elem}')
+            continue
+        if parameter_key == 'dt':
+            continue
+        if parameter_key in sql_parameters:
+            pretty_print(f'{NORM_YEL}命令行多次传入了 SQL 参数 {NORM_GRN}{parameter_key}'
+                         f'{NORM_YEL}, 原值 {NORM_GRN}{sql_parameters[parameter_key]} '
+                         f'{NORM_YEL}将被覆盖为新值 {NORM_GRN}{parameter_value}')
+        sql_parameters[parameter_key] = parameter_value
+    return sql_parameters
+
+
+def run_sql_file(sql_file: str, dt: str, is_last: bool):
+    try:
+        spark_session_name = CONFIG.get('ssn', analyse_session_name(sql_file))
+        sql_file_name = os.path.splitext(os.path.basename(sql_file))[0]
+        CURRENT_LOG_FILE.set(get_log_file_path('spark-sql', dt, sql_file_name))
+        spark_sql = SparkSQL(spark_session_name, udf_files=udf_files, extra_spark_config=extra_spark_config)
+        is_export = CONFIG.get('export')
+        if is_last and is_export is True:
+            delimiter = CONFIG.get('delimiter', ',')
+            spark_sql.export_data(sql_file_name, sql_file, truncate=False,
+                                  delimiter=delimiter, dt=dt, partition=1, **sql_parameters)
+        else:
+            spark_sql.execute(sql_file, check_parameter=True, dt=dt, **sql_parameters)
+    except Exception as e:
+        message = f'执行SQL文件 {sql_file} 失败(dt={dt})'
+        if IS_RUN_IN_RELEASE_DIR and IS_RUN_BY_RELEASE_USER:
+            message += f'(可访问网址 http://{HOST}/log/spark-sql/{dt}/{os.path.basename(CURRENT_LOG_FILE.get())} 查看日志)'
+        # report the failure regardless of where the job runs, then honour the -0/-n flags
+        print(message)
+        ERROR_COUNT.set(ERROR_COUNT.get() + 1)
+        if CONTINUE_ALL_ON_ERROR is True or CONTINUE_NEXT_ON_ERROR is True:
+            return
+        raise e
+
+
+def run_sql_files(sql_files: List[str], dt: str):
+    for index, sql_file in enumerate(sql_files):
+        sql_file_name = os.path.basename(sql_file)
+        if sql_file_name in ('create_table.sql', 'create-table.sql'):
+            continue
+        sql_file_dir = os.path.basename(os.path.dirname(sql_file))
+        if sql_file_dir in ('create_table', 'create-table'):
+            continue
+        run_sql_file(sql_file, dt, len(sql_files) == index + 1)
+
+
+def run_sql_directories(dt: str):
+    directories = CONFIG.get('d', [])
+    if isinstance(directories, str):
+        directories = [directories]
+    for each_dir in directories:
+        sql_files_in_dir = list_files(each_dir, extension='.sql')
+        sql_files_in_dir = sorted(sql_files_in_dir)
+        if len(sql_files_in_dir) == 0:
+            pretty_print(f'{NORM_RED}文件夹 {NORM_GRN}{each_dir}{NORM_RED} 中未找到任何可执行的SQL文件')
+            usage(1)
+        run_sql_files(sql_files_in_dir, dt)
+
+
+if __name__ == '__main__':
+    pretty_print(f'{NORM_MGT}{sys.argv[0]} 收到参数:{NORM_GRN}{" ".join(sys.argv[1:])}')
+
+    CONFIG, _ = parse_args(sys.argv[1:])
+    # 未提供任何参数或查看帮助
+    if len(sys.argv) == 1 or exist(CONFIG, KL_HELP):
+        usage(0)
+    CONTINUE_NEXT_ON_ERROR = CONFIG.get('n')
+    CONTINUE_ALL_ON_ERROR = CONFIG.get('0')
+    ERROR_COUNT = ValueContainer(0)
+    date_range = get_date_range(CONFIG.get(K_DT, get_yesterday()))
+    udf_files = CONFIG.get('u', [])
+    if isinstance(udf_files, str):
+        udf_files = [udf_files]
+    if COMMON_SPARK_UDF_FILE not in udf_files:
+        udf_files.insert(0, COMMON_SPARK_UDF_FILE)
+    extra_spark_config = parse_cli_spark_config()
+    sql_parameters = parse_sql_parameters()
+    if 'f' in CONFIG:
+        arg_sql_files = CONFIG.get('f', [])
+        if isinstance(arg_sql_files, str):
+            arg_sql_files = [arg_sql_files]
+        if len(arg_sql_files) == 0:
+            pretty_print(f'{NORM_RED}未提供任何可执行的SQL文件')
+            usage(1)
+        for each_dt in date_range:
+            run_sql_files(arg_sql_files, each_dt)
+    elif 'd' in CONFIG:
+        for each_dt in date_range:
+            run_sql_directories(each_dt)
+    else:
+        usage(0)
+    if not CONTINUE_ALL_ON_ERROR:
+        exit(ERROR_COUNT.get())
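The -dt argument accepts four forms that get_date_range expands into the per-day loop driving run_sql_files. A minimal sketch of that expansion, assuming get_date_range returns a list of yyyyMMdd strings (expand_date_range below is illustrative, not the project helper):

from datetime import datetime, timedelta

def expand_date_range(dt_arg: str):
    """Illustrative expansion of the four -dt forms described in usage()."""
    yesterday = (datetime.now() - timedelta(days=1)).strftime('%Y%m%d')
    if ',' in dt_arg:                       # form 4: discrete dates, e.g. 20211101,20211103
        return dt_arg.split(',')
    if '-' in dt_arg:                       # forms 2/3: 20211101- or 20211101-20211107
        start, _, stop = dt_arg.partition('-')
        stop = stop or yesterday
        current, dates = datetime.strptime(start, '%Y%m%d'), []
        while current.strftime('%Y%m%d') <= stop:
            dates.append(current.strftime('%Y%m%d'))
            current += timedelta(days=1)
        return dates
    return [dt_arg]                         # form 1: a single date

print(expand_date_range('20211101-20211103'))  # ['20211101', '20211102', '20211103']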

+ 0 - 0
conf/.gitkeep


+ 123 - 0
dw_base/__init__.py

@@ -0,0 +1,123 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+
+import os
+import socket
+import sys
+import time
+
+import findspark
+
+
+def cow_says():
+    # functions.sh presumably prints a banner when sourced; go through bash explicitly,
+    # since `source` is a bash builtin and os.system() runs /bin/sh
+    os.system(f'bash -c "source {PROJECT_ROOT_PATH}/bin/common/functions.sh"')
+
+
+os.environ['HADOOP_CONF_DIR'] = '/etc/hadoop/conf'
+# os.environ['HIVE_CONF_DIR'] = '/etc/hive/conf'
+# os.environ['JAVA_HOME'] = '/usr/local/java'
+os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
+os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
+# os.environ['SPARK_HOME'] = '/usr/hdp/3.1.5.0-152/spark2'
+os.environ['PYTHONUNBUFFERED'] = 'x'
+PROJECT_ROOT_PATH = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
+PROJECT_NAME = os.path.basename(PROJECT_ROOT_PATH)
+sys.path.append(PROJECT_ROOT_PATH)
+# 公用的Spark UDF文件
+COMMON_SPARK_UDF_FILE = 'dw_base/spark/udf/spark_common_udf.py'
+BANNED_USER = 'root'
+RELEASE_USER = 'alvis'
+USER = os.environ['USER']
+HOME = os.environ['HOME']
+if USER == BANNED_USER and HOME.startswith('/home'):
+    USER = os.path.basename(HOME)
+HOST = socket.gethostname()
+RELEASE_ROOT_DIR = '/home/alvis/release'
+
+if not PROJECT_ROOT_PATH.startswith(RELEASE_ROOT_DIR) or USER != RELEASE_USER:
+    DO_RESET: str = '\033[0m'
+    CHG_BOLD: str = '\033[1m'
+    NORM_RED: str = '\033[0;31m'
+    NORM_GRN: str = '\033[0;32m'
+    NORM_YEL: str = '\033[0;33m'
+    NORM_BLU: str = '\033[0;34m'
+    NORM_MGT: str = '\033[0;35m'
+    NORM_CYN: str = '\033[0;36m'
+    NORM_WHT: str = '\033[0;37m'
+
+    BOLD_RED: str = '\033[1;31m'
+    BOLD_GRN: str = '\033[1;32m'
+    BOLD_YEL: str = '\033[1;33m'
+    BOLD_BLU: str = '\033[1;34m'
+    BOLD_MGT: str = '\033[1;35m'
+    BOLD_CYN: str = '\033[1;36m'
+    BOLD_WHT: str = '\033[1;37m'
+
+    BGRD_RED: str = '\033[41m'
+    BGRD_GRN: str = '\033[42m'
+    BGRD_YEL: str = '\033[43m'
+    BGRD_BLU: str = '\033[44m'
+    BGRD_MGT: str = '\033[45m'
+    BGRD_CYN: str = '\033[46m'
+    BGRD_WHT: str = '\033[47m'
+else:
+    DO_RESET: str = ''
+    CHG_BOLD: str = ''
+    NORM_RED: str = ''
+    NORM_GRN: str = ''
+    NORM_YEL: str = ''
+    NORM_BLU: str = ''
+    NORM_MGT: str = ''
+    NORM_CYN: str = ''
+    NORM_WHT: str = ''
+
+    BOLD_RED: str = ''
+    BOLD_GRN: str = ''
+    BOLD_YEL: str = ''
+    BOLD_BLU: str = ''
+    BOLD_MGT: str = ''
+    BOLD_CYN: str = ''
+    BOLD_WHT: str = ''
+
+    BGRD_RED: str = ''
+    BGRD_GRN: str = ''
+    BGRD_YEL: str = ''
+    BGRD_BLU: str = ''
+    BGRD_MGT: str = ''
+    BGRD_CYN: str = ''
+    BGRD_WHT: str = ''
+IS_RUN_BY_RELEASE_USER = False
+IS_RUN_BY_NORMAL_USER = False
+if USER == RELEASE_USER:
+    LOG_ROOT_DIR = "/opt/data/log"
+    IS_RUN_BY_RELEASE_USER = True
+elif USER == BANNED_USER:
+    ERROR_CODE = 18
+    print(f'{NORM_MGT}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+          f'{NORM_RED}Project {NORM_GRN}{PROJECT_NAME} '
+          f'{NORM_RED}is running by banned user {NORM_GRN}{BANNED_USER}'
+          f'{NORM_RED}, exit with error code {NORM_GRN}{ERROR_CODE}'
+          f'{DO_RESET}')
+    exit(ERROR_CODE)
+else:
+    IS_RUN_BY_NORMAL_USER = True
+    LOG_ROOT_DIR = f'{HOME}/data/log'
+    cow_says()
+    print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+          f'{NORM_MGT}Project {NORM_GRN}{PROJECT_NAME} '
+          f'{NORM_MGT}is running in normal user {NORM_GRN}{USER}')
+if PROJECT_ROOT_PATH.startswith(f'{RELEASE_ROOT_DIR}/{PROJECT_NAME}'):
+    IS_RUN_IN_RELEASE_DIR = True
+    print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+          f'{NORM_MGT}Project {NORM_GRN}{PROJECT_NAME} '
+          f'{NORM_MGT}is running in release dir {NORM_GRN}{RELEASE_ROOT_DIR}/{PROJECT_NAME}')
+else:
+    IS_RUN_IN_RELEASE_DIR = False
+    print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+          f'{NORM_MGT}Project {NORM_GRN}{PROJECT_NAME} '
+          f'{NORM_MGT}is running in normal user dir {NORM_GRN}{PROJECT_ROOT_PATH}')
+
+if not IS_RUN_IN_RELEASE_DIR or USER != RELEASE_USER:
+    os.system(f'echo -en "{NORM_GRN}"')
+os.system(f'echo -en "{DO_RESET}"')
+findspark.init()

+ 0 - 0
dw_base/common/__init__.py


+ 14 - 0
dw_base/common/alerter_constants.py

@@ -0,0 +1,14 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+
+WECHAT_WORK_WEB_HOOK = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key='
+# Business-analysis alerts
+BA_LITTLE_CUTE = '4c95be61-c6ae-47e3-adf7-616a8879b383'
+# Data-platform job-scheduling alerts
+DCP_LITTLE_CUTE = 'cc3653b1-78cb-465a-bf95-bf5f5303a37a'
+# Data-platform ETL alerts
+ETL_LITTLE_CUTE = 'ddeabb01-49e4-4558-9199-855b014b4835'
+# 搜客宝 (SKB) data-cleaning alerts
+SKB_LITTLE_CUTE = '79e5666f-8342-49f5-b8e1-375929248112'
+# Real-time pipeline alerts
+REALTIME_LITTLE_CUTE = 'cfa1f519-62d6-4f53-99a6-57ba0f1ca802'
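Each constant is the key suffix of a WeChat Work group-bot webhook. A hedged sketch of posting a text alert through that webhook, using its documented text-message payload (the send_alert helper is an assumption for illustration, not part of this commit):

import json
import urllib.request

def send_alert(key: str, content: str) -> None:
    # The group-bot endpoint accepts a JSON body; "text" is the simplest msgtype.
    payload = json.dumps({'msgtype': 'text', 'text': {'content': content}}).encode('utf-8')
    request = urllib.request.Request(WECHAT_WORK_WEB_HOOK + key, data=payload,
                                     headers={'Content-Type': 'application/json'})
    urllib.request.urlopen(request, timeout=10)

# e.g. send_alert(ETL_LITTLE_CUTE, 'ods_crl_xxx sync failed, dt=20211101')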

+ 8 - 0
dw_base/common/config_constants.py

@@ -0,0 +1,8 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+
+KL_HELP = ['h', 'H', 'help']
+K_CONFIG_FILE = 'c'
+K_SOP_CONFIG_FILE = 'f'
+K_DIRECTORY = 'd'
+K_DT = 'dt'

+ 14 - 0
dw_base/common/container.py

@@ -0,0 +1,14 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+
+
+
+class ValueContainer:
+    def __init__(self, value=None):
+        self.value = value
+
+    def get(self):
+        return self.value
+
+    def set(self, value):
+        self.value = value
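ValueContainer is a one-slot mutable holder; spark-sql-starter.py wraps ERROR_COUNT in it so a nested function can bump the counter without needing a global/nonlocal declaration. A small usage sketch:

counter = ValueContainer(0)

def record_failure():
    # mutating the shared container works from any scope,
    # whereas rebinding a plain int would need a `global` declaration
    counter.set(counter.get() + 1)

record_failure()
record_failure()
assert counter.get() == 2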

+ 20 - 0
dw_base/common/template_constants.py

@@ -0,0 +1,20 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+
+
+ES_MAPPING_DDL_TEMPLATE = 'conf/template/hive/es-mapping-table-ddl-template.sql'
+HIVE_ODS_DWD_FU_TEMPLATE = 'conf/template/hive/ods-dwd-full-update-template.sql'
+HIVE_RAW_ODS_V_TEMPLATE = 'conf/template/hive/raw-ods-validate-template.sql'
+HIVE_STOCK_V_TEMPLATE = 'conf/template/hive/stock-validate-template.sql'
+MYSQL_HIVE_CREATE_TABLE_TEMPLATE = 'conf/template/hive/mysql-hive-create-table-template.sql'
+MYSQL_HIVE_HBASE_CREATE_TABLE_TEMPLATE = 'conf/template/hive/mysql-hbase-create-table-template.sql'
+SPARK_NEW_RAW_ODS_V_TEMPLATE = 'conf/template/spark/new-raw-ods-validate-template.sql'
+SPARK_ODS_DWD_FU_TEMPLATE = 'conf/template/spark/ods-dwd-full-update-template.sql'
+SPARK_DWS_ES_TEMPLATE = 'conf/template/spark/dws_es_update_repair_template.sql'
+SPARK_DWS_ES_TEMPLATE_SJT = 'conf/template/spark/dws_es_update_repair_template_sjt.sql'
+SPARK_SOP_ODS_DWD_FU_TEMPLATE = 'conf/template/spark/sop-ods-dwd-full-update-template.sql'
+SPARK_RAW_ODS_V_TEMPLATE = 'conf/template/spark/raw-ods-validate-template.sql'
+SPARK_RAW_SOP_V_TEMPLATE = 'conf/template/spark/raw-sop-template.sql'
+SPARK_STOCK_V_TEMPLATE = 'conf/template/spark/stock-validate-template.sql'
+TFC_TEMPLATE = 'conf/template/hive/tiny-file-combine-template.sql'
+SPARK_ES_QUALITY_VERIFICATION_TEMPLATE = 'conf/template/spark/es-data-quality-verification-template.sql'

+ 2 - 0
dw_base/database/__init__.py

@@ -0,0 +1,2 @@
+# -*- coding:utf-8 -*-
+

+ 184 - 0
dw_base/database/mongodb_utils.py

@@ -0,0 +1,184 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+
+from typing import List
+
+from pymongo import MongoClient
+
+
+class MongoDBHandler:
+    def __init__(self, url: str, port: int = None, username: str = None, password: str = None, check_db: str = None,
+                 extra_config: List[str] = None):
+        if not port or not username or not password or not check_db:
+            uri = url
+        else:
+            uri = f'mongodb://{username}:{password}@{url}:{port}/{check_db}'
+            if extra_config:
+                uri = f'{uri}?{"&".join(extra_config)}'
+        self.mongo_client = MongoClient(uri)
+
+
+# ES_CLIENT = Elasticsearch('es-cn-nif1oiv5w0009di0f.public.elasticsearch.aliyuncs.com:9200',
+#                               http_auth=('datafix', 'Aa123456'))
+#
+# INCLUDES = ['id', 'appInfo', 'appName', 'b2bClient', 'b2bInfo', 'b2bOpScope', 'b2bProduct', 'baikeInfo', 'blogName',
+#                 'buyingInfo', 'certL2Name', 'contactAddress', 'entName', 'historyName', 'jobName', 'licenseContent',
+#                 'licenseFileName', 'licenseOffice', 'miniAppName', 'opScope', 'patentName', 'semKeyword', 'semTitle',
+#                 'seoInfo', 'seoKeyword', 'seoTitle', 'siteName', 'softwareProductName', 'tenderInfo', 'tenderName',
+#                 'trademarkName', 'wechatName']
+#
+# def get_pid_by_name(company_name):
+#     ent_col = MONGO_CLIENT.get_database('enterprise').get_collection('EnterpriseBaseInfo')
+#     query = {'ENTNAME': company_name}
+#     projection = {'PID': 1}
+#     res_doc = ent_col.find_one(query, projection)
+#     if res_doc and 'PID' in res_doc:
+#         return res_doc['PID']
+#
+#     return None
+#
+#
+# def search_by_name(company_name):
+#     pid = get_pid_by_name(company_name)
+#     print('pid: %s' % pid)
+#     if pid:
+#         company_details = search_by_pid(pid)
+#         return company_details
+#     return None
+#
+#
+# def get_detail_from_es(index, pid, includes=['id']):
+#     try:
+#         return ES_CLIENT.get(index=index, id=pid, _source_includes=includes)
+#     except NotFoundError as not_found_e:
+#         pass
+#     except Exception as e:
+#         print('Unknown error when get detail from es. error:%s' % str(e))
+#
+#     return None
+#
+#
+# def search_by_pid(pid):
+#     return get_detail_from_es('company_info_prod', pid, INCLUDES)
+#
+#
+# def main(local_save_path):
+#     names = fp.readlines()
+#     result_docs = []
+#     for name in names:
+#         print('company name: %s' % name)
+#         detail = search_by_name(name.replace('\n', '').strip())
+#         if detail is None:
+#             print('can not found detail for name:%s' % name)
+#             continue
+#         source = detail['_source']
+#         current_doc = []
+#         # 为了保证顺序,需按INCLUDES遍历获取字段值
+#         for field in INCLUDES:
+#             if field in source:
+#                 if isinstance(source[field], str) and source[field] != '':
+#                     current_doc.append(source[field].replace('\t', ';').replace('\n', ';').replace('\r', ';'))
+#                 elif isinstance(source[field], list) and source[field] != []:
+#                     current_doc.append(
+#                         ','.join(source[field]).replace('\t', ';').replace('\n', ';').replace('\r', ';'))
+#                 else:
+#                     current_doc.append('')
+#             else:
+#                 current_doc.append('')
+#         result_docs.append(current_doc)
+#     res_df = pd.DataFrame(result_docs, mysql_column_list=INCLUDES)
+#     res_df.to_csv(local_save_path, sep='\t', index=False, encoding='utf-8', header=True)
+#     # os.system("source /etc/profile;hadoop fs -put %s %s" % (local_save_path, hdfs_save_path))
+#
+#
+# FLAGS = set('n ng nrfg nrt nt vn un'.split())
+#
+#
+# def lcut(text):
+#     if isinstance(text, list):
+#         print(text)
+#     return [
+#         w.word for sentence in split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip())
+#         for w in dt.cut(sentence) if len(w.word) > 2 and w.flag in FLAGS]
+#
+#
+# class TFIDF:
+#     def __init__(self, dictionary, model):
+#         self.model = model
+#         self.doc2bow = dictionary.doc2bow
+#         self.id2word = {i: w for w, i in dictionary.token2id.items()}
+#
+#     @classmethod
+#     def train(cls, texts):
+#         texts1 = [lcut(text) for text in texts]
+#         dictionary = Dictionary(texts1)
+#         corpus = [dictionary.doc2bow(text) for text in texts1]
+#         model = TfidfModel(corpus)
+#         return cls(dictionary, model)
+#
+#     def extract(self, text, top_n=10):
+#         vector = self.doc2bow(lcut(text))
+#         key_words = sorted(self.model[vector], key=lambda x: x[1], reverse=True)
+#         return [self.id2word[i] for i, j in key_words][:top_n]
+#
+#
+# def extract_keywords(mysql_column_list):
+#     columns_list = raw_data_df[mysql_column_list].values.tolist()
+#     combined = [';'.join(row) for row in columns_list]
+#     keywords = tf_idf_model.extract(';'.join(combined), top_n=50)
+#     print(','.join(keywords))
+#     return keywords
+#
+#
+# if __name__ == '__main__':
+#     fp = open(get_abs_path('data/seed_company_name_yidong.csv'))
+#     local_save_path = '/root/wwj-hive-warehouse/data/zhong_qi_customer_info_yidong.csv'
+#     hdfs_save_path = ''
+#
+#     main(local_save_path)
+#     raw_data_df = pd.read_csv(local_save_path, sep='\t', encoding='utf-8')
+#     raw_data_df = raw_data_df.fillna('')
+#     raw_data_df['combined'] = raw_data_df[INCLUDES].apply(lambda row: ';'.join(row.values.astype(str)), axis=1)
+#     all_text_info_list = raw_data_df['combined'].values.tolist()
+#     tf_idf_model = TFIDF.train(all_text_info_list)
+#
+#     op_scope_keywords = extract_keywords(['opScope', 'b2bOpScope'])
+#     product_keywords = extract_keywords(['appInfo', 'appName', 'b2bProduct'])
+#     jobName_keywords = extract_keywords(['jobName'])
+#     semKeyword_keywords = extract_keywords(['semKeyword'])
+#     baike_keywords = extract_keywords(['baikeInfo'])
+#     b2bInfo_keywords = extract_keywords(['b2bInfo'])
+#     # opScope = raw_data_df[['opScope']].values.tolist()
+#     # opScope_keywords = tf_idf_model.extract(opScope)
+#     #
+#     # opScope = raw_data_df[['opScope']].values.tolist()
+#     # opScope_keywords = tf_idf_model.extract(opScope)
+#     #
+#     # opScope = raw_data_df[['opScope']].values.tolist()
+#     # opScope_keywords = tf_idf_model.extract(opScope)
+#     # pid = get_pid_by_name('杭州德玛瑞户外用品有限公司')
+#     # print(pid)
+
+# mongo_url_a42 = 'mongodb://dw_all_ro:Dt#R30ES@' \
+#                 'dds-m5e44df0967967a41.mongodb.rds.aliyuncs.com:3717,' \
+#                 'dds-m5e44df0967967a42.mongodb.rds.aliyuncs.com:3717,' \
+#                 'dds-m5e44df0967967a43.mongodb.rds.aliyuncs.com:3717' \
+#                 '/admin?replicaSet=mgset-12596773&readReference=secondaryPreferred'
+# a42_handler = MongoDBHandler(mongo_url_a42)
+# mongo_url_b41 = 'mongodb://dw_all_rw:W#ioQseT@' \
+#                 'dds-m5ed9ea9d9a653b41.mongodb.rds.aliyuncs.com:3717,' \
+#                 'dds-m5ed9ea9d9a653b42.mongodb.rds.aliyuncs.com:3717,' \
+#                 'dds-m5ed9ea9d9a653b43.mongodb.rds.aliyuncs.com:3717' \
+#                 '/admin?replicaSet=mgset-45687639&readReference=secondaryPreferred'
+# b41_handler = MongoDBHandler(mongo_url_b41)
+# mongo_url_dev = 'mongodb://dev_dw_ro:Dt#R30ES@dds-m5e686962c7b71641431-pub.mongodb.rds.aliyuncs.com:3717/admin'
+# dev_handler = MongoDBHandler(mongo_url_dev)
+#
+# if __name__ == '__main__':
+#     mongo_handler = MongoDBHandler(url='dds-m5e44df0967967a42.mongodb.rds.aliyuncs.com',
+#                                    port=3717,
+#                                    username='dw_all_ro',
+#                                    password='Dt#R30ES',
+#                                    check_db='admin')
+#     for db in mongo_handler.mongo_client.list_database_names():
+#         print(db)
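MongoDBHandler either takes a complete connection string or assembles mongodb://user:password@host:port/authdb (plus optional query parameters) from the separate arguments. A hedged usage sketch with placeholder host and credentials:

# Placeholder connection details for illustration only.
handler = MongoDBHandler(url='mongo.example.internal',
                         port=3717,
                         username='dw_reader',
                         password='not-a-real-password',
                         check_db='admin',
                         extra_config=['replicaSet=rs0', 'readPreference=secondaryPreferred'])
collection = handler.mongo_client.get_database('enterprise').get_collection('EnterpriseBaseInfo')
print(collection.estimated_document_count())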

+ 185 - 0
dw_base/database/mysql_utils.py

@@ -0,0 +1,185 @@
+# -*- coding:utf-8 -*-
+
+import json
+import re
+from typing import Dict, List
+
+import pymysql
+
+
+class MySQLColumn(object):
+    def __init__(self,
+                 column_name: str,
+                 column_type: str,
+                 column_comment: str,
+                 ordinal_position: int,
+                 is_nullable: str):
+        self.COLUMN_NAME = column_name
+        self.COLUMN_TYPE = column_type
+        self.COLUMN_COMMENT = column_comment
+        self.ORDINAL_POSITION = ordinal_position
+        self.IS_NULLABLE = is_nullable
+        self._dict = {
+            'COLUMN_NAME': column_name,
+            'COLUMN_TYPE': column_type,
+            'COLUMN_COMMENT': column_comment,
+            'ORDINAL_POSITION': ordinal_position,
+            'IS_NULLABLE': is_nullable,
+        }
+
+    def __str__(self):
+        return json.dumps(self._dict, ensure_ascii=False)
+
+
+class MySQLHandler:
+    def __init__(self, host: str, port: int, username: str, password: str, database: str = None):
+        """
+        MySQL工具包
+        Args:
+            host: 实例地址
+            port: 端口
+            username: 用户名
+            password: 密码
+        """
+        self.jdbcUrl = "jdbc:mysql://%s:%s" % (host, port)
+        self.username = username
+        self.password = password
+        self.connection = pymysql.connect(
+            host=host,
+            port=port,
+            user=username,
+            password=password,
+            database=database,
+            charset='utf8'
+        )
+        self.connection.autocommit(True)
+
+    def list_tables(self,
+                    database: str = None,
+                    exclude_regex: List[str] = None,
+                    table_regex: List[str] = None) -> Dict[str, str]:
+        """
+        列出指定数据库中的表
+        Args:
+            database: 数据库名称
+            exclude_regex: 不要的数据表正则
+            table_regex: 想要的数据表正则
+        Returns: 表及注释
+        """
+        assert database is not None
+        curs = self.connection.cursor()
+        curs.execute('SET NAMES utf8')
+        curs.execute(f'use {database}')
+        sql = "SELECT TABLE_NAME, TABLE_COMMENT " \
+              "  FROM information_schema.TABLES " \
+              f" WHERE TABLE_SCHEMA='{database}' AND TABLE_TYPE = 'BASE TABLE'"
+        curs.execute(sql)
+        rows = curs.fetchall()
+        tables = {}
+        for each_row in rows:
+            if exclude_regex:
+                exclude = False
+                for regex in exclude_regex:
+                    if re.match(regex, each_row[0]):
+                        exclude = True
+                        break
+                if exclude:
+                    continue
+            if table_regex:
+                match = False
+                for regex in table_regex:
+                    if re.match(regex, each_row[0]):
+                        match = True
+                        break
+                if not match:
+                    continue
+            tables[each_row[0]] = each_row[1]
+        return tables
+
+    def list_columns(self, database: str, table_name: str) -> List[MySQLColumn]:
+        """
+        列出指定数据库、指定表的字段及字段的其他信息
+        Args:
+            database: 数据库
+            table_name: 表
+        Returns: 字段及字段的其他信息
+        """
+        assert database is not None
+        assert table_name is not None
+        curs = self.connection.cursor()
+        curs.execute('SET NAMES utf8')
+        curs.execute(f'use {database}')
+        detail_names = ['COLUMN_TYPE', 'COLUMN_COMMENT', 'ORDINAL_POSITION', 'IS_NULLABLE']
+        sql = "SELECT COLUMN_NAME, %s" \
+              "  FROM information_schema.COLUMNS " \
+              " WHERE TABLE_SCHEMA = '%s' AND TABLE_NAME = '%s'"
+        sql = sql % (', '.join(detail_names), database, table_name)
+        curs.execute(sql)
+        rows = curs.fetchall()
+        columns = []
+        for each_row in rows:
+            column_name = each_row[0]
+            column_type = each_row[1]
+            column_comment = each_row[2]
+            ordinal_position = each_row[3]
+            is_nullable = each_row[4]
+            mysql_column = MySQLColumn(column_name, column_type, column_comment, ordinal_position, is_nullable)
+            columns.append(mysql_column)
+        return columns
+
+    def query(self, sql: str):
+        curs = self.connection.cursor()
+        curs.execute('SET NAMES utf8')
+        curs.execute(sql)
+        rows = curs.fetchall()
+        return rows
+
+    def query_column_hive_metadata(self, table_name: str):
+        curs = self.connection.cursor()
+        curs.execute('SET NAMES utf8')
+        sql = f'SELECT' \
+              f'	t.TBL_NAME,' \
+              f'	c.COLUMN_NAME,' \
+              f'	c.TYPE_NAME,' \
+              f'	c.`COMMENT` ' \
+              f'FROM ' \
+              f' hive.TBLS t' \
+              f' LEFT JOIN hive.SDS s ON t.SD_ID = s.SD_ID' \
+              f' LEFT JOIN hive.COLUMNS_V2 c ON s.CD_ID = c.CD_ID' \
+              f' LEFT JOIN hive.TBLS tbs ON s.SD_ID = tbs.SD_ID ' \
+              f'WHERE t.TBL_NAME = "{table_name}"'
+        curs.execute(sql)
+        column_info = curs.fetchall()
+        return column_info
+
+    def query_tbl_hive_metadata(self, table_name: str):
+        curs = self.connection.cursor()
+        curs.execute('SET NAMES utf8')
+        sql = f'SELECT' \
+              f'	tp.PARAM_KEY,' \
+              f'	tp.PARAM_VALUE ' \
+              f'FROM' \
+              f'	hive.TABLE_PARAMS tp' \
+              f'	LEFT JOIN hive.TBLS t ON tp.TBL_ID = t.TBL_ID ' \
+              f'WHERE' \
+              f'	t.TBL_NAME = "{table_name}"'
+        curs.execute(sql)
+        column_info = curs.fetchall()
+        return column_info
+
+
+if __name__ == '__main__':
+    mysql_handler = MySQLHandler(
+        'rm-m5e76y41wq677ogz7.mysql.rds.aliyuncs.com',
+        3306,
+        'bigdata_sync',
+        '76iW6SG2K6RGN2X68EQb'
+    )
+    database_name = 'ik_bms_production'
+    tables = mysql_handler.list_tables(database_name)
+    for table_name, table_comment in tables.items():
+        print(f'{table_name}\t{table_comment}')
+        columns = mysql_handler.list_columns(database_name, table_name)
+        for col in columns:
+            print(col)
+        break
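list_tables filters information_schema through the optional regex lists, so a sync job can whitelist or blacklist tables before asking list_columns for each schema. A sketch with placeholder connection details:

# Placeholder connection details for illustration only.
handler = MySQLHandler('mysql.example.internal', 3306, 'reader', 'not-a-real-password')
tables = handler.list_tables('ik_bms_production',
                             exclude_regex=[r'.*_bak', r'tmp_.*'],
                             table_regex=[r'ods_.*'])
for name, comment in tables.items():
    print(name, comment)
    for column in handler.list_columns('ik_bms_production', name):
        print('   ', column)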

+ 0 - 0
dw_base/datax/__init__.py


+ 0 - 0
dw_base/datax/datasources/__init__.py


+ 16 - 0
dw_base/datax/datasources/clickhouse_data_source.py

@@ -0,0 +1,16 @@
+# -*- coding:utf-8 -*-
+
+
+from dw_base.datax.datasources.data_source import DataSource
+from dw_base.datax.datax_constants import DS_CLICK_HOUSE_JDBC_URL
+
+# ClickHouse Data Source
+DS_TYPE_CLICK_HOUSE = 'clickhouse'
+DS_CLICK_HOUSE_KEYS = [DS_CLICK_HOUSE_JDBC_URL, 'username', 'password']
+
+
+class ClickHouseDataSource(DataSource):
+    def __init__(self, ds_file: str):
+        super(ClickHouseDataSource, self).__init__(ds_file)
+        self.source_type = DS_TYPE_CLICK_HOUSE
+        self.keys = DS_CLICK_HOUSE_KEYS

+ 27 - 0
dw_base/datax/datasources/data_source.py

@@ -0,0 +1,27 @@
+# -*- coding:utf-8 -*-
+
+from configparser import ConfigParser
+from typing import Dict
+
+
+class DataSource(object):
+    def __init__(self, ds_file: str):
+        self.ds_file = ds_file
+        self.config_parser = ConfigParser()
+        self.config_parser.read(self.ds_file)
+        self.ds_dict = {}
+        self.source_type = None
+        self.keys = []
+
+    def get_datasource_dict(self) -> Dict[str, str]:
+        for key in self.keys:
+            # ConfigParser raises NoSectionError/NoOptionError rather than KeyError, so use a fallback check instead
+            value = self.config_parser.get('base', key, fallback=None)
+            if value is None:
+                raise KeyError('%s must be specified in the %s data source config.' % (key, self.source_type))
+            self.ds_dict[key] = value
+        return self.ds_dict
+
+    def parse(self):
+        if self.source_type is None:
+            raise NotImplementedError('please use a specified class of data source.')
+        return self.get_datasource_dict()
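Every concrete DataSource reads its keys from the [base] section of an ini file; for MySQL those keys are jdbcUrl, username and password (see MySQLDataSource further down in this commit). A hedged round-trip sketch with placeholder values:

import tempfile

from dw_base.datax.datasources.mysql_data_source import MySQLDataSource

ini_text = MySQLDataSource.generate_definition(
    host='mysql.example.internal', port=3306,
    username='reader', password='not-a-real-password', database='crl_ods')
with tempfile.NamedTemporaryFile('w', suffix='.ini', delete=False) as handle:
    handle.write(ini_text)
    ds_file = handle.name

datasource = MySQLDataSource(ds_file)
print(datasource.parse())
# {'jdbcUrl': 'jdbc:mysql://mysql.example.internal:3306/crl_ods', 'username': 'reader', 'password': 'not-a-real-password'}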

+ 35 - 0
dw_base/datax/datasources/data_source_factory.py

@@ -0,0 +1,35 @@
+# -*- coding:utf-8 -*-
+
+
+from dw_base.datax.datasources.data_source import DataSource
+from dw_base.datax.datasources.elasticseach_data_source import DS_TYPE_ELASTICSEARCH, ElasticsearchDataSource
+from dw_base.datax.datasources.hbase_data_source import DS_TYPE_HBASE, HBaseDataSource
+from dw_base.datax.datasources.hdfs_data_source import DS_TYPE_HDFS, HDFSDataSource
+from dw_base.datax.datasources.kafka_data_source import KafkaDataSource, DS_TYPE_KAFKA
+from dw_base.datax.datasources.mongo_data_source import DS_TYPE_MONGO, MongoDataSource
+from dw_base.datax.datasources.mysql_data_source import DS_TYPE_MYSQL, MySQLDataSource
+from dw_base.datax.datasources.postgresql_data_source import DS_TYPE_POSTGRE_SQL, PostgreSQLDataSource
+from dw_base.datax.datasources.clickhouse_data_source import DS_TYPE_CLICK_HOUSE, ClickHouseDataSource
+
+
+class DataSourceFactory:
+    @staticmethod
+    def get_data_source(ds_type: str, ds_file_path: str) -> DataSource:
+        if ds_type == DS_TYPE_ELASTICSEARCH:
+            return ElasticsearchDataSource(ds_file_path)
+        elif ds_type == DS_TYPE_HBASE:
+            return HBaseDataSource(ds_file_path)
+        elif ds_type == DS_TYPE_HDFS:
+            return HDFSDataSource(ds_file_path)
+        elif ds_type == DS_TYPE_KAFKA:
+            return KafkaDataSource(ds_file_path)
+        elif ds_type == DS_TYPE_MONGO:
+            return MongoDataSource(ds_file_path)
+        elif ds_type == DS_TYPE_MYSQL:
+            return MySQLDataSource(ds_file_path)
+        elif ds_type == DS_TYPE_POSTGRE_SQL:
+            return PostgreSQLDataSource(ds_file_path)
+        elif ds_type == DS_TYPE_CLICK_HOUSE:
+            return ClickHouseDataSource(ds_file_path)
+        else:
+            raise ValueError('DataSource type %s defined in %s is not supported yet' % (ds_type, ds_file_path))

+ 21 - 0
dw_base/datax/datasources/elasticseach_data_source.py

@@ -0,0 +1,21 @@
+# -*- coding:utf-8 -*-
+
+
+from dw_base.datax.datasources.data_source import DataSource
+
+# Elasticsearch Data Source
+DS_TYPE_ELASTICSEARCH = 'elasticsearch'
+DS_ELASTICSEARCH_KEYS = [
+    'nodes',
+    'port',
+    'user',
+    'password'
+]
+
+
+class ElasticsearchDataSource(DataSource):
+
+    def __init__(self, ds_file: str):
+        super(ElasticsearchDataSource, self).__init__(ds_file)
+        self.source_type = DS_TYPE_ELASTICSEARCH
+        self.keys = DS_ELASTICSEARCH_KEYS

+ 30 - 0
dw_base/datax/datasources/hbase_data_source.py

@@ -0,0 +1,30 @@
+# -*- coding:utf-8 -*-
+
+
+from dw_base.datax.datasources.data_source import DataSource
+
+# HBase Data Source
+DS_TYPE_HBASE = 'hbase'
+DS_HBASE_KEYS = [
+    'kerberos.enabled',
+    'kerberos.realm',
+    'kerberos.kdc',
+    'kerberos.krb5Conf',
+    'zookeeper.quorum',
+    'zookeeper.port'
+]
+
+
+class HBaseDataSource(DataSource):
+    def __init__(self, ds_file: str):
+        super(HBaseDataSource, self).__init__(ds_file)
+        self.source_type = DS_TYPE_HBASE
+        self.keys = DS_HBASE_KEYS
+
+    @staticmethod
+    def generate_definition(conf: str) -> str:
+        lines = [
+            '[base]',
+            'conf = %s' % conf
+        ]
+        return '\n'.join(lines)

+ 23 - 0
dw_base/datax/datasources/hdfs_data_source.py

@@ -0,0 +1,23 @@
+# -*- coding:utf-8 -*-
+
+
+from dw_base.datax.datasources.data_source import DataSource
+
+# HDFS Data Source
+DS_TYPE_HDFS = 'hdfs'
+DS_HDFS_KEYS = ['defaultFS']
+
+
+class HDFSDataSource(DataSource):
+    def __init__(self, ds_file: str):
+        super(HDFSDataSource, self).__init__(ds_file)
+        self.source_type = DS_TYPE_HDFS
+        self.keys = DS_HDFS_KEYS
+
+    @staticmethod
+    def generate_definition(default_fs: str) -> str:
+        lines = [
+            '[base]',
+            'defaultFS = %s' % default_fs
+        ]
+        return '\n'.join(lines)

+ 23 - 0
dw_base/datax/datasources/kafka_data_source.py

@@ -0,0 +1,23 @@
+# -*- coding:utf-8 -*-
+
+from dw_base.datax.datasources.data_source import DataSource
+
+DS_TYPE_KAFKA = 'kafka'
+DS_KAFKA_KEYS = [
+    'brokers',
+]
+
+
+class KafkaDataSource(DataSource):
+    def __init__(self, ds_file: str):
+        super(KafkaDataSource, self).__init__(ds_file)
+        self.source_type = DS_TYPE_KAFKA
+        self.keys = DS_KAFKA_KEYS
+
+    @staticmethod
+    def generate_definition(conf: str) -> str:
+        lines = [
+            '[base]',
+            'brokers = %s' % conf
+        ]
+        return '\n'.join(lines)

+ 30 - 0
dw_base/datax/datasources/mongo_data_source.py

@@ -0,0 +1,30 @@
+# -*- coding:utf-8 -*-
+
+
+from dw_base.datax.datasources.data_source import DataSource
+
+# Mongo DataSource
+DS_TYPE_MONGO = 'mongo'
+DS_MONGO_KEYS = ['address']
+
+
+class MongoDataSource(DataSource):
+    def __init__(self, ds_file: str):
+        super(MongoDataSource, self).__init__(ds_file)
+        self.source_type = DS_TYPE_MONGO
+        self.keys = DS_MONGO_KEYS
+
+    def get_datasource_dict(self):
+        for key in self.keys:
+            # ConfigParser raises NoSectionError/NoOptionError rather than KeyError, so use a fallback check instead
+            value = self.config_parser.get('base', key, fallback=None)
+            if value:
+                if key == 'address':
+                    # the DataX mongo plugins expect a list of addresses
+                    self.ds_dict[key] = [value]
+                else:
+                    self.ds_dict[key] = value
+            else:
+                raise KeyError('%s must be specified in the %s data source config.' % (key, self.source_type))
+        return self.ds_dict

+ 26 - 0
dw_base/datax/datasources/mysql_data_source.py

@@ -0,0 +1,26 @@
+# -*- coding:utf-8 -*-
+
+
+from dw_base.datax.datasources.data_source import DataSource
+from dw_base.datax.datax_constants import DS_MYSQL_JDBC_URL
+
+# MySQL Data Source
+DS_TYPE_MYSQL = 'mysql'
+DS_MYSQL_KEYS = [DS_MYSQL_JDBC_URL, 'username', 'password']
+
+
+class MySQLDataSource(DataSource):
+    def __init__(self, ds_file: str):
+        super(MySQLDataSource, self).__init__(ds_file)
+        self.source_type = DS_TYPE_MYSQL
+        self.keys = DS_MYSQL_KEYS
+
+    @staticmethod
+    def generate_definition(host: str, port: int, username: str, password: str, database: str) -> str:
+        lines = [
+            '[base]',
+            'jdbcUrl=jdbc:mysql://%s:%s/%s' % (host, str(port), database),
+            'username=%s' % username,
+            'password=%s' % password
+        ]
+        return '\n'.join(lines)

+ 16 - 0
dw_base/datax/datasources/postgresql_data_source.py

@@ -0,0 +1,16 @@
+# -*- coding:utf-8 -*-
+
+
+from dw_base.datax.datasources.data_source import DataSource
+from dw_base.datax.datax_constants import DS_POSTGRE_SQL_JDBC_URL
+
+# PostgreSQL Data Source
+DS_TYPE_POSTGRE_SQL = 'postgresql'
+DS_POSTGRE_SQL_KEYS = [DS_POSTGRE_SQL_JDBC_URL, 'username', 'password']
+
+
+class PostgreSQLDataSource(DataSource):
+    def __init__(self, ds_file: str):
+        super(PostgreSQLDataSource, self).__init__(ds_file)
+        self.source_type = DS_TYPE_POSTGRE_SQL
+        self.keys = DS_POSTGRE_SQL_KEYS

+ 43 - 0
dw_base/datax/datax_constants.py

@@ -0,0 +1,43 @@
+# -*- coding:utf-8 -*-
+
+# 全量同步标记
+ALL_DATA_DATE = '19700101'
+# column_type
+COLUMN_TYPE_DATE = 'date'
+COLUMN_TYPE_DOUBLE = 'double'
+COLUMN_TYPE_INT = 'int'
+COLUMN_TYPE_LONG = 'long'
+COLUMN_TYPE_STRING = 'string'
+DS_MYSQL_JDBC_URL = 'jdbcUrl'
+DS_POSTGRE_SQL_JDBC_URL = 'jdbcUrl'
+DS_CLICK_HOUSE_JDBC_URL = 'jdbcUrl'
+# DataX作业配置生成器配置
+GEN_CONFIG_KEY_DATA_SOURCE = 'dataSource'
+GEN_CONFIG_KEY_COLUMN_TYPE = 'columnType'
+GEN_CONFIG_KEY_COLUMN_FORMAT = 'columnFormat'
+GEN_CONFIG_KEY_COLUMN_SPLITTER_ITEM_TYPE = 'splitterItemType'
+# for job.setting.speed
+JOB_SETTING_SPEED_BYTE = 'byte'
+JOB_SETTING_SPEED_CHANNEL = 'channel'
+JOB_SETTING_SPEED_RECORD = 'record'
+# for job.content[n].{reader/writer}.name
+PLUGIN_NAME = 'name'
+# for job.content[n].{reader/writer}.parameter
+PLUGIN_PARAMETER = 'parameter'
+# for job.content[n].{reader/writer}.column
+PLUGIN_PARAMETER_COLUMN = 'column'
+# for job.content[n].{reader}.addColumn
+PLUGIN_PARAMETER_ADD_COLUMN = 'addColumn'
+# for job.content[n].{reader/writer}.column[n].{index/type/name/format/splitter/itemtype}
+PLUGIN_PARAMETER_COLUMN_N_FORMAT = 'format'
+PLUGIN_PARAMETER_COLUMN_N_INDEX = 'index'
+PLUGIN_PARAMETER_COLUMN_N_ITEM_TYPE = "itemtype"
+PLUGIN_PARAMETER_COLUMN_N_NAME = 'name'
+PLUGIN_PARAMETER_COLUMN_N_SPLITTER = "splitter"
+PLUGIN_PARAMETER_COLUMN_N_TYPE = 'type'
+# for job.content[n].{reader/writer}.connection
+PLUGIN_PARAMETER_CONNECTION = 'connection'
+# for value of job.content[n].reader
+JOB_CONTENT_N_READER = 'reader'
+# for value of job.content[n].writer
+JOB_CONTENT_N_WRITER = 'writer'

+ 40 - 0
dw_base/datax/datax_utils.py

@@ -0,0 +1,40 @@
+# -*- coding:utf-8 -*-
+
+from typing import Dict, List
+
+from dw_base import NORM_GRN, NORM_YEL
+from dw_base.database.mysql_utils import MySQLColumn
+from dw_base.utils.log_utils import pretty_print
+
+
+def convert_mysql_column_types(columns: List[MySQLColumn]) -> Dict[str, str]:
+    column_types = {}
+    bool_types = ['bool', 'bit']
+    double_types = ['float', 'double']
+    string_types = ['text', 'longtext', 'mediumtext', 'time']
+    timestamp_types = ['datetime', 'timestamp']
+    for mysql_column in columns:
+        column_name = mysql_column.COLUMN_NAME
+        origin_type = mysql_column.COLUMN_TYPE
+        datax_type = None
+        if origin_type.startswith('bigint'):
+            datax_type = 'bigint'
+        elif 'int' in origin_type:
+            datax_type = 'int'
+        elif origin_type in timestamp_types:
+            datax_type = 'timestamp'
+        elif origin_type == 'date':
+            datax_type = 'date'
+        elif any(origin_type.startswith(t) for t in bool_types):
+            datax_type = 'boolean'
+        elif any(origin_type.startswith(t) for t in double_types) or origin_type.startswith('decimal'):
+            datax_type = 'double'
+        elif origin_type in string_types or origin_type.startswith(('varchar', 'char', 'enum')):
+            # string columns keep the default DataX string type, so nothing to record
+            pass
+        else:
+            pretty_print(f'{NORM_YEL}遇到了未处理MySQL——DataX类型映射的MySQL类型:{NORM_GRN}{origin_type}')
+        if datax_type is not None:
+            column_types[column_name] = datax_type
+    return column_types
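The mapping collapses MySQL column types onto the few DataX types and deliberately skips string-like columns (callers default any missing name to string). A quick check of the expected result:

columns = [
    MySQLColumn('id', 'bigint(20)', 'primary key', 1, 'NO'),
    MySQLColumn('price', 'decimal(10,2)', 'amount', 2, 'YES'),
    MySQLColumn('name', 'varchar(64)', 'company name', 3, 'YES'),
    MySQLColumn('updated_at', 'datetime', 'last update time', 4, 'YES'),
]
print(convert_mysql_column_types(columns))
# {'id': 'bigint', 'price': 'double', 'updated_at': 'timestamp'} -- 'name' falls back to string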

+ 89 - 0
dw_base/datax/job_config_generator.py

@@ -0,0 +1,89 @@
+# -*- coding:utf-8 -*-
+
+import json
+from configparser import ConfigParser
+
+from dw_base.datax.datax_constants import *
+from dw_base.datax.plugins.plugin_factory import PluginFactory
+from dw_base.utils import datetime_utils
+from dw_base.utils.file_utils import delete_file, get_abs_path
+
+
+class JobConfigGenerator(object):
+    """
+    生成DataX作业配置文件
+    """
+
+    def __init__(self, base_dir: str, generator_config: str, start_date: str, stop_date: str, output: str):
+        """
+        初始化
+        Args:
+            base_dir: 项目目录
+            generator_config: DataX作业配置生成器配置文件,格式为ini文件,包含两部分reader和writer,两部分都包含dataSource配置
+            start_date: 内参,开始日期
+            stop_date: 内参,结束日期
+            output: 结果(DataX作业配置文件)输出文件路径
+        """
+        self.generator_config = get_abs_path(generator_config)
+        self.base_dir = base_dir
+        self.start_date = start_date
+        self.stop_date = stop_date
+        self.output = output
+        self.config_parser = ConfigParser()
+        self.config_parser.read(self.generator_config)
+
+    def get_reader(self):
+        reader = PluginFactory.get_plugin('reader', self.base_dir, self.config_parser, self.start_date, self.stop_date)
+        return reader.configure()
+
+    def get_writer(self):
+        writer = PluginFactory.get_plugin('writer', self.base_dir, self.config_parser, self.start_date, self.stop_date)
+        return writer.configure()
+
+    @staticmethod
+    def get_speed(channel=6, byte=268435456, record=100000):
+        return {JOB_SETTING_SPEED_CHANNEL: channel, JOB_SETTING_SPEED_BYTE: byte, JOB_SETTING_SPEED_RECORD: record}
+
+    @staticmethod
+    def get_core_speed(byte=268435456, record=100000):
+        return {
+            'transport': {
+                'channel': {
+                    'speed': {
+                        'byte': byte,
+                        'record': record
+                    }
+                }
+            }
+        }
+
+    def assemble(self):
+        # Use tighter byte/record limits during working hours (07:50-19:00) to reduce load on the sources
+        local_time = int(datetime_utils.formatted_now('%H%M'))
+        if 750 < local_time < 1900:
+            core_speed = self.get_core_speed(byte=10485760, record=40000)
+        else:
+            speed = self.get_speed()
+            core_speed = self.get_core_speed()
+        job_config_json = {
+            'job': {
+                'content': [
+                    {
+                        'reader': self.get_reader(),
+                        'writer': self.get_writer()
+                    }
+                ],
+                'setting': {
+                    'speed': speed
+                }
+            },
+            'core': core_speed
+        }
+        return job_config_json
+
+    def run(self):
+        job_config_json = self.assemble()
+        # Overwriting through the HDFS mount appears unreliable, so delete the old file first
+        delete_file(self.output)
+        with open(self.output, 'w') as w:
+            json.dump(job_config_json, w, ensure_ascii=False)
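The generator config is an ini file with a [reader] and a [writer] section; both carry the generic dataSource/column/columnType keys from datax_constants plus the per-plugin options (the ClickHouse reader options are shown below; the dataSource names and writer options are placeholders). A skeletal example, parsed here only to show the layout:

from configparser import ConfigParser

generator_ini = """
[reader]
dataSource = clickhouse-report-prod
column = id,name,updated_at
columnType = id:long,updated_at:date
database = report
table = t_company
where = updated_at >= '${start_date}' and updated_at < '${stop_date}'
querySql =
fetchSize = 1000
splitPk = id

[writer]
dataSource = hdfs-warehouse
column = id,name,updated_at
columnType = id:long,updated_at:date
"""
parser = ConfigParser()
parser.read_string(generator_ini)
print(parser.get('reader', 'dataSource'), '->', parser.get('writer', 'dataSource'))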

+ 0 - 0
dw_base/datax/plugins/__init__.py


+ 168 - 0
dw_base/datax/plugins/plugin.py

@@ -0,0 +1,168 @@
+# -*- coding:utf-8 -*-
+
+import os
+import pwd
+import re
+from configparser import ConfigParser
+from datetime import datetime
+from typing import Dict
+
+from dw_base.datax.datasources.data_source import DataSource
+from dw_base.datax.datasources.data_source_factory import DataSourceFactory
+from dw_base.datax.datax_constants import *
+
+
+class Plugin(object):
+
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
+        self.base_dir = base_dir
+        self.config_parser = config_parser
+        self.start_date = str(start_date)
+        self.stop_date = str(stop_date)
+        self.parameter = {}
+        self.config = {}
+        self.ds_file = None
+        self.ds_type = None
+        self.ds_file_path = None
+        self.datasource = None  # type: DataSource
+        self.plugin_type = None
+        self.plugin_name = None
+        self.columns = []
+        self.add_columns = []
+
+    def init(self):
+        self.check()
+        self.ds_file = self.config_parser.get(self.plugin_type, GEN_CONFIG_KEY_DATA_SOURCE)
+        self.ds_type = self.ds_file.split('/')[-1].split('-')[0]
+
+        self.ds_file_path = f'{self.base_dir}/../datasource/{self.ds_type}/{self.ds_file}.ini'
+
+        if not os.path.exists(self.ds_file_path) or not os.path.isfile(self.ds_file_path):
+            raise FileNotFoundError(self.ds_file_path)
+        self.datasource = DataSourceFactory.get_data_source(self.ds_type, self.ds_file_path)
+
+    def check(self):
+        if not self.plugin_name:
+            raise ValueError('Plugin.plugin_name at every implemented class of Plugin should be specified.')
+        if not self.plugin_type:
+            raise ValueError('Plugin.plugin_type at every type of Plugin should be specified.')
+        if not (self.start_date == ALL_DATA_DATE or re.fullmatch(r'\d{8}', self.start_date)):
+            raise ValueError('start_date %s must be in yyyyMMdd format or equal %s' % (self.start_date, ALL_DATA_DATE))
+        if not (self.stop_date == ALL_DATA_DATE or re.fullmatch(r'\d{8}', self.stop_date)):
+            raise ValueError('stop_date %s must be in yyyyMMdd format or equal %s' % (self.stop_date, ALL_DATA_DATE))
+
+    def check_config(self, key, value):
+        if not value:
+            raise ValueError('config %s of %s %s not provided' % (key, self.plugin_type, self.plugin_name))
+
+    def load_data_source(self):
+        ds_dict: Dict[str, str] = self.datasource.parse()
+        for key, value in ds_dict.items():
+            self.parameter[key] = value
+
+    def load_column(self):
+        columns_info = []
+        self.columns = [
+            col.strip() for col in self.config_parser.get(self.plugin_type, PLUGIN_PARAMETER_COLUMN).split(',')
+        ]
+        column_type = self.get_column_type()
+        column_format = self.get_column_format()
+        splitter_item_type = self.get_splitter_item_type()
+        if isinstance(self.columns, list):
+            if isinstance(self.columns[0], str):
+                for i, column in enumerate(self.columns):
+                    column_info = {
+                        PLUGIN_PARAMETER_COLUMN_N_INDEX: i,
+                        PLUGIN_NAME: column,
+                        PLUGIN_PARAMETER_COLUMN_N_TYPE: column_type.get(column, COLUMN_TYPE_STRING)
+                    }
+                    if column in column_format.keys():
+                        column_info[PLUGIN_PARAMETER_COLUMN_N_FORMAT] = column_format[column]
+                    if column in splitter_item_type.keys():
+                        splitter, item_type = splitter_item_type[column].split('_')
+                        column_info[PLUGIN_PARAMETER_COLUMN_N_SPLITTER] = splitter
+                        column_info[PLUGIN_PARAMETER_COLUMN_N_ITEM_TYPE] = item_type
+                    columns_info.append(column_info)
+            elif isinstance(self.columns[0], dict):
+                # data works中的数据集成
+                columns_info = self.columns
+            else:
+                raise TypeError('column element type error.')
+        else:
+            raise Exception('columns must be a list.')
+
+        # addColumn handling for the generator ini (currently only verified with hdfsreader)
+        if (self.plugin_type == JOB_CONTENT_N_READER
+                and self.config_parser.has_option(JOB_CONTENT_N_READER, PLUGIN_PARAMETER_ADD_COLUMN)):
+            self.add_columns = [
+                (col.split(':')[0], col.split(':')[1])
+                for col in self.config_parser.get(self.plugin_type, PLUGIN_PARAMETER_ADD_COLUMN).split(',')
+            ]
+        for column in self.add_columns:
+            if column[0] == "batch_id":
+                batch_id = datetime.now().strftime('%Y%m%d%H%M')
+                columns_info_add = {"value": batch_id, "type": column[1]}
+                columns_info.append(columns_info_add)
+                # let the scheduler capture batch_id via setValue (used downstream to refresh ES in batches)
+                print('${setValue(batch_id=%s)}' % batch_id)
+                # also extend the writer's column/columnType so the added constant column is written
+                col_value = self.config_parser.get(JOB_CONTENT_N_WRITER, PLUGIN_PARAMETER_COLUMN)
+                col_type_value = self.config_parser.get(JOB_CONTENT_N_WRITER, GEN_CONFIG_KEY_COLUMN_TYPE)
+                self.config_parser.set(
+                    JOB_CONTENT_N_WRITER, PLUGIN_PARAMETER_COLUMN,
+                    value=f"{col_value},{column[0]}"
+                )
+                self.config_parser.set(
+                    JOB_CONTENT_N_WRITER, GEN_CONFIG_KEY_COLUMN_TYPE,
+                    value=f"{col_type_value},{column[0]}:{column[1]}"
+                )
+        self.parameter[PLUGIN_PARAMETER_COLUMN] = columns_info
+
+    def get_column_format(self):
+        column_format = {}
+        if self.config_parser.has_option(self.plugin_type, GEN_CONFIG_KEY_COLUMN_FORMAT):
+            c_format = self.config_parser.get(self.plugin_type, GEN_CONFIG_KEY_COLUMN_FORMAT)
+            if c_format:
+                for item in c_format.split(','):
+                    k, v = item.split('##')
+                    column_format[k] = v
+        return column_format
+
+    def get_splitter_item_type(self):
+        splitter_item_type = {}
+        if self.config_parser.has_option(self.plugin_type, GEN_CONFIG_KEY_COLUMN_SPLITTER_ITEM_TYPE):
+            splitter_item = self.config_parser.get(self.plugin_type, GEN_CONFIG_KEY_COLUMN_SPLITTER_ITEM_TYPE)
+            if splitter_item:
+                for item in splitter_item.split(','):
+                    k, v = item.split(':')
+                    splitter_item_type[k] = v
+        return splitter_item_type
+
+    def get_column_type(self):
+        column_type = {}
+        c_types = self.config_parser.get(self.plugin_type, GEN_CONFIG_KEY_COLUMN_TYPE)
+        if c_types:
+            for item in c_types.split(','):
+                k, v = item.split(':')
+                column_type[k] = v
+        return column_type
+
+    def load_parameter(self):
+        self.load_data_source()
+        self.load_column()
+        self.load_others()
+
+    def assemble_plugin(self):
+        self.load_parameter()
+        self.config[PLUGIN_NAME] = self.plugin_name
+        self.config[PLUGIN_PARAMETER] = self.parameter
+
+    def configure(self):
+        self.init()
+        self.assemble_plugin()
+        return self.config
+
+    def load_others(self):
+        raise NotImplementedError('please implement this method in a specified writer.')
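The addColumn option appends constant columns to the reader output (the comment in load_column notes only hdfsreader has been verified); each entry is name:type, and a batch_id entry gets a run timestamp as its value. A small sketch of what the assembled reader column list looks like for `addColumn = batch_id:string` (values are illustrative):

from datetime import datetime

# columns declared as "column = id,name" ...
columns_info = [
    {'index': 0, 'name': 'id', 'type': 'string'},
    {'index': 1, 'name': 'name', 'type': 'string'},
]
# ... plus the constant column appended by the addColumn handling
batch_id = datetime.now().strftime('%Y%m%d%H%M')   # same format used in Plugin.load_column
columns_info.append({'value': batch_id, 'type': 'string'})
print(columns_info)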

+ 70 - 0
dw_base/datax/plugins/plugin_factory.py

@@ -0,0 +1,70 @@
+# -*- coding:utf-8 -*-
+from dw_base.datax.datasources.clickhouse_data_source import DS_TYPE_CLICK_HOUSE
+from dw_base.datax.datasources.data_source import *
+from dw_base.datax.datasources.elasticseach_data_source import DS_TYPE_ELASTICSEARCH
+from dw_base.datax.datasources.hbase_data_source import DS_TYPE_HBASE
+from dw_base.datax.datasources.hdfs_data_source import DS_TYPE_HDFS
+from dw_base.datax.datasources.kafka_data_source import DS_TYPE_KAFKA
+from dw_base.datax.datasources.mongo_data_source import DS_TYPE_MONGO
+from dw_base.datax.datasources.mysql_data_source import DS_TYPE_MYSQL
+from dw_base.datax.datasources.postgresql_data_source import DS_TYPE_POSTGRE_SQL
+from dw_base.datax.datax_constants import *
+from dw_base.datax.plugins.plugin import Plugin
+from dw_base.datax.plugins.reader.clickhouse_reader import ClickHouseReader
+from dw_base.datax.plugins.reader.hdfs_reader import HDFSReader
+from dw_base.datax.plugins.reader.mongo_reader import MongoReader
+from dw_base.datax.plugins.reader.mysql_reader import MySQLReader
+from dw_base.datax.plugins.reader.postgresql_reader import PostgreSQLReader
+from dw_base.datax.plugins.writer.clickhouse_writer import ClickHouseWriter
+from dw_base.datax.plugins.writer.elasticsearch_writer import ElasticsearchWriter
+from dw_base.datax.plugins.writer.hbase_writer import HBaseWriter
+from dw_base.datax.plugins.writer.hdfs_writer import HDFSWriter
+from dw_base.datax.plugins.writer.kafka_writer import KafkaWriter
+from dw_base.datax.plugins.writer.mongo_writer import MongoWriter
+from dw_base.datax.plugins.writer.mysql_writer import MySQLWriter
+from dw_base.datax.plugins.writer.postgresql_writer import PostgreSQLWriter
+
+
+class PluginFactory:
+    @staticmethod
+    def get_plugin(plugin_type: str, base_dir: str, config_parser: ConfigParser, start_time: str,
+                   stop_time: str) -> Plugin:
+        ds_file = config_parser.get(plugin_type, GEN_CONFIG_KEY_DATA_SOURCE)
+        ds_type = ds_file.split('/')[-1].split('-')[0]
+        if plugin_type == JOB_CONTENT_N_READER:
+            if ds_type == DS_TYPE_HDFS:
+                plugin = HDFSReader(base_dir, config_parser, start_time, stop_time)
+            elif ds_type == DS_TYPE_MONGO:
+                plugin = MongoReader(base_dir, config_parser, start_time, stop_time)
+            elif ds_type == DS_TYPE_MYSQL:
+                plugin = MySQLReader(base_dir, config_parser, start_time, stop_time)
+            elif ds_type == DS_TYPE_POSTGRE_SQL:
+                plugin = PostgreSQLReader(base_dir, config_parser, start_time, stop_time)
+            elif ds_type == DS_TYPE_CLICK_HOUSE:
+                plugin = ClickHouseReader(base_dir, config_parser, start_time, stop_time)
+            else:
+                raise ValueError('DataSource type %s of reader defined in %s is not supported yet' % (ds_type, ds_file))
+        elif plugin_type == JOB_CONTENT_N_WRITER:
+            if ds_type == DS_TYPE_ELASTICSEARCH:
+                plugin = ElasticsearchWriter(base_dir, config_parser, start_time, stop_time)
+            elif ds_type == DS_TYPE_HBASE:
+                plugin = HBaseWriter(base_dir, config_parser, start_time, stop_time)
+            elif ds_type == DS_TYPE_HDFS:
+                plugin = HDFSWriter(base_dir, config_parser, start_time, stop_time)
+            elif ds_type == DS_TYPE_KAFKA:
+                plugin = KafkaWriter(base_dir, config_parser, start_time, stop_time)
+            elif ds_type == DS_TYPE_MONGO:
+                plugin = MongoWriter(base_dir, config_parser, start_time, stop_time)
+            elif ds_type == DS_TYPE_MYSQL:
+                plugin = MySQLWriter(base_dir, config_parser, start_time, stop_time)
+            elif ds_type == DS_TYPE_POSTGRE_SQL:
+                plugin = PostgreSQLWriter(base_dir, config_parser, start_time, stop_time)
+            elif ds_type == DS_TYPE_CLICK_HOUSE:
+                plugin = ClickHouseWriter(base_dir, config_parser, start_time, stop_time)
+            else:
+                raise ValueError('DataSource type %s of writer defined in %s is not supported yet' % (ds_type, ds_file))
+        else:
+            raise ValueError('Unsupported plugin type %s' % plugin_type)
+        return plugin

+ 2 - 0
dw_base/datax/plugins/reader/__init__.py

@@ -0,0 +1,2 @@
+# -*- coding:utf-8 -*-
+

+ 74 - 0
dw_base/datax/plugins/reader/clickhouse_reader.py

@@ -0,0 +1,74 @@
+# -*- coding:utf-8 -*-
+
+import re
+from configparser import ConfigParser
+
+from dw_base.datax.datax_constants import *
+from dw_base.datax.plugins.reader.reader import Reader
+
+# ClickHouse reader
+
+CLICK_HOUSE_READER_NAME = 'clickhousereader'
+CLICK_HOUSE_READER_PARAMETER_CONNECTION = 'connection'
+CLICK_HOUSE_READER_PARAMETER_DATABASE = 'database'
+CLICK_HOUSE_READER_PARAMETER_FETCH_SIZE = 'fetchSize'
+CLICK_HOUSE_READER_PARAMETER_QUERY_SQL = 'querySql'
+CLICK_HOUSE_READER_PARAMETER_TABLE = 'table'
+CLICK_HOUSE_READER_PARAMETER_COLUMN = 'column'
+CLICK_HOUSE_READER_PARAMETER_WHERE = 'where'
+CLICK_HOUSE_READER_PARAMETER_SPLIT_PK = 'splitPk'
+
+
+class ClickHouseReader(Reader):
+
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
+        super(ClickHouseReader, self).__init__(base_dir, config_parser, start_date, stop_date)
+        self.plugin_name = CLICK_HOUSE_READER_NAME
+
+    def load_others(self):
+        start_date = self.start_date
+        stop_date = self.stop_date
+        database = self.config_parser.get(self.plugin_type, CLICK_HOUSE_READER_PARAMETER_DATABASE)
+        self.check_config(CLICK_HOUSE_READER_PARAMETER_DATABASE, database)
+        table = self.config_parser.get(self.plugin_type, CLICK_HOUSE_READER_PARAMETER_TABLE)
+        self.check_config(CLICK_HOUSE_READER_PARAMETER_TABLE, table)
+        fetch_size = self.config_parser.get(self.plugin_type, CLICK_HOUSE_READER_PARAMETER_FETCH_SIZE) or '1000'
+        self.parameter[CLICK_HOUSE_READER_PARAMETER_FETCH_SIZE] = fetch_size
+        split_pk = self.config_parser.get(self.plugin_type, CLICK_HOUSE_READER_PARAMETER_SPLIT_PK)
+        self.parameter[CLICK_HOUSE_READER_PARAMETER_SPLIT_PK] = split_pk
+        where = self.config_parser.get(self.plugin_type, CLICK_HOUSE_READER_PARAMETER_WHERE)
+        where = where.replace('${start_date}', start_date)
+        where = where.replace('${start-date}', start_date)
+        where = where.replace('${stop_date}', stop_date)
+        where = where.replace('${stop-date}', stop_date)
+        self.parameter[CLICK_HOUSE_READER_PARAMETER_WHERE] = where
+        jdbc_url: str = self.parameter[DS_CLICK_HOUSE_JDBC_URL]
+        matcher = re.search('jdbc:clickhouse://(.+?)/(.+)', jdbc_url)
+        if matcher:
+            if database:
+                jdbc_url = jdbc_url.replace(matcher.group(2), database)
+        elif jdbc_url.endswith('/'):
+            jdbc_url = f'{jdbc_url}{database}'
+        else:
+            jdbc_url = f'{jdbc_url}/{database}'
+        query_sql = self.config_parser.get(self.plugin_type, CLICK_HOUSE_READER_PARAMETER_QUERY_SQL)
+        query_sql = query_sql.replace('${start_date}', start_date)
+        query_sql = query_sql.replace('${start-date}', start_date)
+        query_sql = query_sql.replace('${stop_date}', stop_date)
+        query_sql = query_sql.replace('${stop-date}', stop_date)
+        if query_sql:
+            connection = {
+                DS_CLICK_HOUSE_JDBC_URL: jdbc_url.split(','),
+                CLICK_HOUSE_READER_PARAMETER_QUERY_SQL: query_sql
+            }
+        else:
+            connection = {
+                DS_CLICK_HOUSE_JDBC_URL: jdbc_url.split(','),
+                CLICK_HOUSE_READER_PARAMETER_TABLE: table.split(',')
+            }
+        self.parameter[CLICK_HOUSE_READER_PARAMETER_CONNECTION] = [connection]
+        del self.parameter[DS_CLICK_HOUSE_JDBC_URL]
+
+    def load_column(self):
+        columns = self.config_parser.get(self.plugin_type, CLICK_HOUSE_READER_PARAMETER_COLUMN).split(',')
+        self.parameter[CLICK_HOUSE_READER_PARAMETER_COLUMN] = columns

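A standalone sketch of the date substitution this reader applies to `where` and `querySql` before building the connection element (the column name and the run window below are hypothetical):

    start_date, stop_date = '2024-01-01', '2024-01-02'
    where = "created_at >= '${start_date}' and created_at < '${stop_date}'"
    for placeholder, value in (('${start_date}', start_date), ('${start-date}', start_date),
                               ('${stop_date}', stop_date), ('${stop-date}', stop_date)):
        where = where.replace(placeholder, value)
    # -> "created_at >= '2024-01-01' and created_at < '2024-01-02'"
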
+ 78 - 0
dw_base/datax/plugins/reader/hdfs_reader.py

@@ -0,0 +1,78 @@
+# -*- coding:utf-8 -*-
+
+from configparser import ConfigParser
+from typing import List, Dict
+
+from dw_base.datax.plugins.reader.reader import Reader
+
+# hdfs reader
+HDFS_READER_NAME = 'hdfsreader'
+HDFS_READER_PARAMETER_COMPRESS = 'compress'
+HDFS_READER_PARAMETER_ENCODING = 'encoding'
+HDFS_READER_PARAMETER_FIELD_DELIMITER = 'fieldDelimiter'
+HDFS_READER_PARAMETER_FILE_TYPE = 'fileType'
+HDFS_READER_PARAMETER_NULL_FORMAT = "nullFormat"
+HDFS_READER_PARAMETER_PATH = 'path'
+
+
+class HDFSReader(Reader):
+
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
+        super(HDFSReader, self).__init__(base_dir, config_parser, start_date, stop_date)
+        self.plugin_name = HDFS_READER_NAME
+
+    def load_others(self):
+        path = self.config_parser.get(self.plugin_type, HDFS_READER_PARAMETER_PATH)
+        self.check_config(HDFS_READER_PARAMETER_PATH, path)
+        if path.__contains__('${start_date}'):
+            path = path.replace('${start_date}', self.start_date)
+        if path.__contains__('${start-date}'):
+            path = path.replace('${start-date}', self.start_date)
+        if path.__contains__('${dt}'):
+            path = path.replace('${dt}', self.start_date)
+        self.parameter[HDFS_READER_PARAMETER_PATH] = path
+        self.parameter[HDFS_READER_PARAMETER_FILE_TYPE] = \
+            self.config_parser.get(self.plugin_type, HDFS_READER_PARAMETER_FILE_TYPE) or 'text'
+        self.parameter[HDFS_READER_PARAMETER_ENCODING] = \
+            self.config_parser.get(self.plugin_type, HDFS_READER_PARAMETER_ENCODING) or 'UTF-8'
+        self.parameter[HDFS_READER_PARAMETER_COMPRESS] = \
+            self.config_parser.get(self.plugin_type, HDFS_READER_PARAMETER_COMPRESS, fallback='') or ''
+        self.parameter[HDFS_READER_PARAMETER_NULL_FORMAT] = \
+            self.config_parser.get(self.plugin_type, HDFS_READER_PARAMETER_NULL_FORMAT) or ''
+        self.parameter[HDFS_READER_PARAMETER_FIELD_DELIMITER] = \
+            self.config_parser.get(self.plugin_type, HDFS_READER_PARAMETER_FIELD_DELIMITER) or '\t'
+        self.parameter[HDFS_READER_PARAMETER_FIELD_DELIMITER] = \
+            self.parameter[HDFS_READER_PARAMETER_FIELD_DELIMITER].replace("\\t", "\t")
+
+    @staticmethod
+    def generate_definition(hdfs_ds_name: str, hdfs_path: str,
+                            hive_database: str, hive_table_name: str, hive_table_comment: str, partitioned: bool,
+                            column_names: List[str], column_types: Dict[str, str]) -> str:
+        if partitioned:
+            # partitioned table
+            path = f'{hdfs_path}/{hive_database}.db/{hive_table_name}/dt=%s' % '${dt}'
+        else:
+            # non-partitioned table
+            path = f'{hdfs_path}/{hive_database}.db/{hive_table_name}'
+        column = []
+        column_type = []
+        for col_name in column_names:
+            column.append(col_name)
+            if column_types.__contains__(col_name):
+                curr_type = column_types.get(col_name)
+                if curr_type.lower() != 'string':
+                    column_type.append(f'{col_name}:{curr_type.upper()}')
+        column_type = ','.join(column_type)
+        definition = [
+            f'# {hive_table_name}: {hive_table_comment}',
+            '[reader]',
+            'dataSource = %s' % hdfs_ds_name,
+            f'column = {",".join(column)}',
+            f'columnType = {column_type}',
+            f'path = {path}',
+            'fileType = orc',
+            'encoding = UTF-8',
+            r'fieldDelimiter = \t',
+            'nullFormat ='
+        ]
+        return '\n'.join(definition)

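A quick sketch of what HDFSReader.generate_definition produces for a hypothetical partitioned Hive table (all names below are made up):

    from dw_base.datax.plugins.reader.hdfs_reader import HDFSReader

    print(HDFSReader.generate_definition(
        hdfs_ds_name='cdh/hdfs-default',
        hdfs_path='/user/hive/warehouse',
        hive_database='ods',
        hive_table_name='ods_order',
        hive_table_comment='order fact table',
        partitioned=True,
        column_names=['id', 'amount', 'remark'],
        column_types={'id': 'bigint', 'amount': 'double', 'remark': 'string'}))
    # path resolves to /user/hive/warehouse/ods.db/ods_order/dt=${dt}
    # columnType keeps only non-string columns: id:BIGINT,amount:DOUBLE
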
+ 42 - 0
dw_base/datax/plugins/reader/mongo_reader.py

@@ -0,0 +1,42 @@
+# -*- coding:utf-8 -*-
+
+from configparser import ConfigParser
+
+from dw_base.datax.datax_constants import *
+from dw_base.datax.plugins.reader.reader import Reader
+from dw_base.utils.datetime_utils import parse_datetime, date_to_timestamp
+
+# mongo reader
+MONGO_READER_NAME = 'mongodbreader'
+MONGO_READER_PARAMETER_COLLECTION_NAME = 'collectionName'
+MONGO_READER_PARAMETER_DB_NAME = 'dbName'
+MONGO_READER_PARAMETER_QUERY = 'query'
+
+
+class MongoReader(Reader):
+
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
+        super(MongoReader, self).__init__(base_dir, config_parser, start_date, stop_date)
+        self.plugin_name = MONGO_READER_NAME
+
+    def load_others(self):
+        db_name = self.config_parser.get(self.plugin_type, MONGO_READER_PARAMETER_DB_NAME)
+        self.check_config(MONGO_READER_PARAMETER_DB_NAME, db_name)
+        collection_name = self.config_parser.get(self.plugin_type, MONGO_READER_PARAMETER_COLLECTION_NAME)
+        self.check_config(MONGO_READER_PARAMETER_COLLECTION_NAME, collection_name)
+        self.parameter[MONGO_READER_PARAMETER_DB_NAME] = db_name
+        self.parameter[MONGO_READER_PARAMETER_COLLECTION_NAME] = collection_name
+        if self.start_date == ALL_DATA_DATE:
+            self.parameter[MONGO_READER_PARAMETER_QUERY] = '{}'
+        else:
+            query = self.config_parser.get(self.plugin_type, MONGO_READER_PARAMETER_QUERY)
+            if query:
+                if 'ObjectId' in query:
+                    start_dt_str = hex(int(date_to_timestamp(self.start_date)))[2:] + '0000000000000000'
+                    stop_dt_str = hex(int(date_to_timestamp(self.stop_date)))[2:] + '0000000000000000'
+                else:
+                    start_dt_str = parse_datetime(self.start_date).strftime('%Y-%m-%d')
+                    stop_dt_str = parse_datetime(self.stop_date).strftime('%Y-%m-%d')
+                query = query.replace('${start_date}', start_dt_str)
+                query = query.replace('${stop_date}', stop_dt_str)
+            self.parameter[MONGO_READER_PARAMETER_QUERY] = query

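The ObjectId branch works because the first four bytes of a MongoDB ObjectId are a big-endian Unix timestamp, so a date boundary can be expressed as the hex timestamp padded to 24 characters. A standalone sketch; the UTC-midnight conversion is a simplified stand-in for date_to_timestamp:

    from datetime import datetime, timezone

    def boundary_object_id(date_str: str) -> str:
        # simplified stand-in for dw_base.utils.datetime_utils.date_to_timestamp
        ts = int(datetime.strptime(date_str, '%Y%m%d').replace(tzinfo=timezone.utc).timestamp())
        return hex(ts)[2:] + '0000000000000000'

    boundary_object_id('20240101')
    # -> '659200800000000000000000', usable as {"_id": {"$gte": ObjectId("6592...")}}
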
+ 249 - 0
dw_base/datax/plugins/reader/mysql_reader.py

@@ -0,0 +1,249 @@
+# -*- coding:utf-8 -*-
+
+import re
+from configparser import ConfigParser
+from typing import Dict, List
+
+from dw_base.common.template_constants import MYSQL_HIVE_CREATE_TABLE_TEMPLATE, MYSQL_HIVE_HBASE_CREATE_TABLE_TEMPLATE
+from dw_base.database.mysql_utils import MySQLColumn
+from dw_base.datax.datax_constants import *
+from dw_base.datax.plugins.reader.reader import Reader
+from dw_base.utils.datetime_utils import local_2_utc, parse_datetime
+from dw_base.utils.file_utils import read_file_content
+
+# mysql reader
+MYSQL_READER_NAME = 'mysqlreader'
+MYSQL_READER_PARAMETER_COLUMN = 'column'
+MYSQL_READER_PARAMETER_CONNECTION = 'connection'
+MYSQL_READER_PARAMETER_DATABASE = 'database'
+MYSQL_READER_PARAMETER_QUERY_SQL = 'querySql'
+MYSQL_READER_PARAMETER_TABLE = 'table'
+MYSQL_READER_PARAMETER_UTC = 'utc'
+MYSQL_READER_PARAMETER_WHERE = 'where'
+MYSQL_KEYWORDS = ['default', 'desc', 'key', 'start', 'views', 'commit', 'time', 'add', 'admin', 'after', 'all', 'alter',
+                  'analyze', 'and', 'archive', 'array', 'as', 'asc', 'window', 'with', 'year', 'when', 'where', 'while',
+                  'authorization', 'before', 'between', 'bigint', 'binary', 'boolean', 'both', 'bucket', 'buckets',
+                  'by', 'cascade', 'case', 'cast', 'change', 'char', 'cluster', 'clustered', 'clusterstatus',
+                  'collection', 'column', 'columns', 'comment', 'compact', 'compactions', 'compute', 'concatenate',
+                  'conf', 'continue', 'create', 'cross', 'cube', 'current', 'current_date', 'current_timestamp',
+                  'cursor', 'data', 'database', 'databases', 'date', 'datetime', 'day', 'dbproperties', 'decimal',
+                  'deferred', 'defined', 'delete', 'delimited', 'dependency', 'desc', 'describe', 'directories',
+                  'directory', 'disable', 'distinct', 'distribute', 'double', 'drop', 'elem_type', 'else', 'enable',
+                  'end', 'escaped', 'exchange', 'exclusive', 'exists', 'explain', 'export', 'extended', 'external',
+                  'false', 'fetch', 'fields', 'file', 'fileformat', 'first', 'float', 'following', 'for', 'format',
+                  'formatted', 'from', 'full', 'function', 'functions', 'grant', 'group', 'grouping', 'having',
+                  'hold_ddltime', 'hour', 'idxproperties', 'if', 'ignore', 'import', 'in', 'index', 'indexes', 'inner',
+                  'inpath', 'inputdriver', 'inputformat', 'insert', 'int', 'intersect', 'interval', 'into', 'is',
+                  'items', 'jar', 'join', 'keys', 'key_type', 'lateral', 'left', 'less', 'like', 'limit', 'lines',
+                  'load', 'local', 'location', 'lock', 'locks', 'logical', 'long', 'macro', 'map', 'mapjoin',
+                  'materialized', 'minus', 'minute', 'month', 'more', 'msck', 'none', 'noscan', 'not', 'no_drop',
+                  'null', 'of', 'offline', 'on', 'option', 'or', 'order', 'out', 'outer', 'outputdriver',
+                  'outputformat', 'over', 'overwrite', 'owner', 'partialscan', 'partition', 'partitioned', 'partitions',
+                  'percent', 'plus', 'preceding', 'preserve', 'pretty', 'principals', 'procedure', 'protection',
+                  'purge', 'range', 'read', 'readonly', 'reads', 'rebuild', 'recordreader', 'recordwriter', 'reduce',
+                  'regexp', 'reload', 'rename', 'repair', 'replace', 'restrict', 'revoke', 'rewrite', 'right', 'rlike',
+                  'role', 'roles', 'rollup', 'row', 'rows', 'schema', 'schemas', 'second', 'select', 'semi', 'serde',
+                  'serdeproperties', 'server', 'set', 'sets', 'shared', 'show', 'show_database', 'skewed', 'smallint',
+                  'sort', 'sorted', 'ssl', 'statistics', 'stored', 'streamtable', 'string', 'struct', 'table', 'tables',
+                  'tablesample', 'tblproperties', 'temporary', 'terminated', 'then', 'timestamp', 'tinyint', 'to',
+                  'touch', 'transactions', 'transform', 'trigger', 'true', 'truncate', 'unarchive', 'unbounded', 'undo',
+                  'union', 'uniontype', 'uniquejoin', 'unlock', 'unset', 'unsigned', 'update', 'uri', 'use', 'user',
+                  'using', 'utc', 'utctimestamp', 'values', 'value_type', 'varchar', 'view', 'sql', 'start', 'views',
+                  'time', 'admin', 'alter', 'analyze', 'archive', 'array', 'asc', 'window', 'with', 'year', 'when',
+                  'where',
+                  'while', 'authorization', 'bucket', 'buckets', 'by', 'cast', 'cluster', 'clustered',
+                  'clusterstatus', 'collection', 'compactions', 'compute', 'concatenate', 'conf',
+                  'current_timestamp', 'cursor', 'dbproperties', 'decimal', 'deferred', 'defined', 'delimited',
+                  'dependency', 'directories', 'directory', 'distribute', 'elem_type', 'enable', 'end', 'exclusive',
+                  'external', 'false', 'fileformat', 'following', 'format', 'formatted', 'functions', 'grouping',
+                  'having', 'hold_ddltime', 'idxproperties', 'inner', 'inpath', 'inputdriver', 'inputformat',
+                  'intersect',
+                  'is', 'items', 'jar', 'key_type', 'lateral', 'lines', 'load', 'location', 'logical', 'macro', 'map',
+                  'mapjoin', 'materialized', 'minus', 'more', 'msck', 'noscan', 'no_drop', 'null', 'of', 'offline',
+                  'outputdriver', 'outputformat', 'over', 'overwrite', 'partialscan', 'partitioned',
+                  'partitions', 'percent', 'plus', 'preceding', 'pretty', 'principals', 'protection', 'purge',
+                  'readonly',
+                  'recordreader', 'recordwriter', 'reduce', 'regexp', 'restrict', 'rewrite', 'rlike', 'role', 'roles',
+                  'semi', 'serde', 'serdeproperties', 'server', 'set', 'sets', 'shared', 'show', 'show_database',
+                  'skewed', 'smallint', 'sort', 'sorted', 'ssl', 'statistics', 'stored', 'streamtable', 'string',
+                  'struct', 'table', 'tables', 'tablesample', 'tblproperties', 'temporary', 'terminated', 'then',
+                  'timestamp', 'tinyint', 'to', 'touch', 'transactions', 'transform', 'trigger', 'true', 'truncate',
+                  'unarchive', 'unbounded', 'undo', 'union', 'uniontype', 'uniquejoin', 'unlock', 'unset', 'unsigned',
+                  'update', 'uri', 'use', 'user', 'using', 'utc', 'utctimestamp', 'values', 'value_type', 'varchar',
+                  'view', 'release', 'leave', 'condition', 'type', 'types', 'linear', 'repeat', 'check']
+
+
+class MySQLReader(Reader):
+
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
+        super(MySQLReader, self).__init__(base_dir, config_parser, start_date, stop_date)
+        self.plugin_name = MYSQL_READER_NAME
+
+    def load_others(self):
+        start_date = self.start_date
+        stop_date = self.stop_date
+        jdbc_url: str = self.parameter[DS_MYSQL_JDBC_URL]
+        table_name = self.config_parser.get(self.plugin_type, MYSQL_READER_PARAMETER_TABLE)
+        connection_element = {}
+        if jdbc_url.__contains__(','):
+            # multiple JDBC URLs; ignore whether a db_name is present
+            connection_element[DS_MYSQL_JDBC_URL] = jdbc_url.split(',')
+        else:
+            db_name = None
+            if self.config_parser.has_option(self.plugin_type, MYSQL_READER_PARAMETER_DATABASE):
+                db_name = self.config_parser.get(self.plugin_type, MYSQL_READER_PARAMETER_DATABASE)
+            # single JDBC URL
+            matcher = re.search('jdbc:mysql://(.+?)/(.+)', jdbc_url)
+            if matcher:
+                # JDBC URL that already carries a database name (e.g. jdbc:mysql://host/db_a)
+                default_db = matcher.group(2)
+                connection_element[DS_MYSQL_JDBC_URL] = [jdbc_url]
+                # e.g. db_name = db_b,db_c,db_d
+                if db_name:
+                    # replace the default database with each configured one
+                    connection_element[DS_MYSQL_JDBC_URL] = [
+                        jdbc_url.replace(default_db, db) for db in db_name.split(',')
+                    ]
+            else:
+                self.check_config(MYSQL_READER_PARAMETER_DATABASE, db_name)
+                # JDBC URL without a database name; db_name must be provided
+                if not jdbc_url.endswith('/'):
+                    jdbc_url = f'{jdbc_url}/'
+                connection_element[DS_MYSQL_JDBC_URL] = [f'{jdbc_url}{db}' for db in db_name.split(',')]
+        if self.config_parser.has_option(self.plugin_type, MYSQL_READER_PARAMETER_QUERY_SQL):
+            query_sql = self.config_parser.get(self.plugin_type, MYSQL_READER_PARAMETER_QUERY_SQL)
+            if query_sql:
+                # querySql provided
+                connection_element[MYSQL_READER_PARAMETER_QUERY_SQL] = query_sql.split(';')
+            else:
+                # no querySql, so table is required
+                self.check_config(MYSQL_READER_PARAMETER_TABLE, table_name)
+        # table is always written into the JSON regardless of querySql; DataX resolves the priority between them
+        connection_element[MYSQL_READER_PARAMETER_TABLE] = table_name.split(',')
+        connection = [connection_element]
+        self.parameter[MYSQL_READER_PARAMETER_CONNECTION] = connection
+        # drop jdbcUrl from parameter; it now lives inside connection
+        del self.parameter[DS_MYSQL_JDBC_URL]
+        where = ''
+        utc = 0
+        if self.config_parser.has_option(self.plugin_type, MYSQL_READER_PARAMETER_WHERE):
+            where = self.config_parser.get(self.plugin_type, MYSQL_READER_PARAMETER_WHERE)
+        if self.config_parser.has_option(self.plugin_type, MYSQL_READER_PARAMETER_UTC):
+            utc = self.config_parser.get(self.plugin_type, MYSQL_READER_PARAMETER_UTC)
+
+        if where:
+            if utc:
+                start_dt_str = local_2_utc(start_date).strftime('%Y-%m-%d %H:%M:%S')
+                stop_dt_str = local_2_utc(stop_date).strftime('%Y-%m-%d %H:%M:%S')
+                specific_start_dt_str = local_2_utc(start_date).strftime('%Y-%m-%d')
+                specific_stop_dt_str = local_2_utc(stop_date).strftime('%Y-%m-%d')
+            else:
+                start_dt_str = parse_datetime(start_date).strftime('%Y-%m-%d %H:%M:%S')
+                stop_dt_str = parse_datetime(stop_date).strftime('%Y-%m-%d %H:%M:%S')
+                specific_start_dt_str = parse_datetime(start_date).strftime('%Y-%m-%d')
+                specific_stop_dt_str = parse_datetime(stop_date).strftime('%Y-%m-%d')
+            where = where.replace('${start_date}', start_dt_str)
+            where = where.replace('${stop_date}', stop_dt_str)
+            where = where.replace('${start_specific_date}', specific_start_dt_str)
+            where = where.replace('${stop_specific_date}', specific_stop_dt_str)
+
+        # for consistency (UTC vs Beijing time), the check uses self.start_date rather than a converted start string
+        if self.start_date.startswith(ALL_DATA_DATE):
+            self.parameter[MYSQL_READER_PARAMETER_WHERE] = ''
+        else:
+            self.parameter[MYSQL_READER_PARAMETER_WHERE] = where
+
+    def load_column(self):
+        columns = self.config_parser.get(self.plugin_type, MYSQL_READER_PARAMETER_COLUMN).split(',')
+        self.parameter[MYSQL_READER_PARAMETER_COLUMN] = columns
+
+    @staticmethod
+    def generate_definition(database: str, table_name: str, table_comment: str,
+                            column_names: List[str], column_types: Dict[str, str],
+                            datasource_group: str, incremental: str, inc_col: str) -> str:
+
+        column = []
+        for col_name in column_names:
+            if MYSQL_KEYWORDS.__contains__(col_name):
+                column.append(f'`{col_name}`')
+            else:
+                column.append(col_name)
+        column_type = []
+        for col_name in column_names:
+            if column_types.__contains__(col_name):
+                column_type.append(f'{col_name}:{column_types[col_name]}')
+        definition = [
+            f'# {table_name}: {table_comment}',
+            '[reader]',
+            f'dataSource = {datasource_group}/mysql-{database}',
+            f'table = {table_name}',
+            f'column = {",".join(column)}',
+            f'columnType = {",".join(column_type)}',
+            ";全量抽取需传递19700101给参数`start_date`,或直接注释下面的条件;增量抽取请取消注释(有update的表不适用)"
+            ";以具体日期传入当参数时,需要将开始日期参数声明为`start_specific_date`,取短日期范围时,需要同时将结束日期声明为`stop_specific_date`"
+        ]
+        if incremental is not None and incremental:
+            definition.append(f"where = {inc_col} >= '%s' and {inc_col} < '%s'" % ('${start_date}', '${stop_date}'))
+        else:
+            definition.append(f";where = {inc_col} >= '%s' and {inc_col} < '%s'" % ('${start_date}', '${stop_date}'))
+        definition.append(';crm大部分表的时间都是+00(格林威治时间),而实际传入的时间是+08(北京时间),所以通过此字段统一')
+        definition.append(';utc=1 表示原始表是格林威治时间,utc=0或为空 表示原始表为北京时间')
+        definition.append('utc =')
+        return '\n'.join(definition)
+
+    @staticmethod
+    def generate_hive_ddl(hive_database_name: str,
+                          hive_table_name: str,
+                          table_comment: str,
+                          partitioned: bool,
+                          columns: List[MySQLColumn],
+                          column_types: Dict[str, str]) -> str:
+        columns_definition = []
+        partition_def = ''
+        for column in columns:
+            column_name = column.COLUMN_NAME
+            column_comment = column.COLUMN_COMMENT
+            if MYSQL_KEYWORDS.__contains__(column_name):
+                column_name = str(f'`{column_name}`')
+            if column_types.__contains__(column_name):
+                column_type = str(column_types[column_name]).upper()
+            else:
+                column_type = "STRING"
+            columns_definition.append(f"{column_name} {column_type} COMMENT '{column_comment}'")
+        if partitioned is not None and partitioned:
+            partition_def = '\nPARTITIONED BY (dt STRING)'
+        ddl = read_file_content(MYSQL_HIVE_CREATE_TABLE_TEMPLATE).format(
+            hive_database_name, hive_table_name, hive_database_name, hive_table_name,
+            ',\n'.join(columns_definition), table_comment, partition_def
+        )
+        return ddl
+
+    @staticmethod
+    def generate_hive_over_hbase_ddl(hive_database_name: str,
+                                     hive_table_name: str,
+                                     table_comment: str,
+                                     hbase_namespace: str,
+                                     hbase_table_name: str,
+                                     columns: List[MySQLColumn],
+                                     column_types: Dict[str, str]) -> str:
+        columns_definition = []
+        hbase_column_mapping_definition = []
+        partition_def = ''
+        for column in columns:
+            column_name = column.COLUMN_NAME
+            column_comment = column.COLUMN_COMMENT
+            if MYSQL_KEYWORDS.__contains__(column_name):
+                column_name = str(f'`{column_name}`')
+            if column_types.__contains__(column_name):
+                column_type = str(column_types[column_name]).upper()
+            else:
+                column_type = "STRING"
+            columns_definition.append(f"{column_name} {column_type} COMMENT '{column_comment}'")
+            hbase_column_mapping_definition.append(f"cf:{column_name}")
+        ddl_template = read_file_content(MYSQL_HIVE_HBASE_CREATE_TABLE_TEMPLATE)
+        ddl = ddl_template.format(
+            hive_database_name, hive_table_name, hive_database_name, hive_table_name,
+            ',\n'.join(columns_definition), table_comment, partition_def,
+            ',\n'.join(hbase_column_mapping_definition), hbase_namespace, hbase_table_name
+        )
+        return ddl

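When a single jdbcUrl that already names a database is combined with a comma-separated `database` option, the reader fans it out into one URL per database. The replacement logic in isolation (host and database names are hypothetical):

    import re

    jdbc_url = 'jdbc:mysql://db-host:3306/db_a'
    db_name = 'db_b,db_c'
    default_db = re.search('jdbc:mysql://(.+?)/(.+)', jdbc_url).group(2)   # 'db_a'
    urls = [jdbc_url.replace(default_db, db) for db in db_name.split(',')]
    # -> ['jdbc:mysql://db-host:3306/db_b', 'jdbc:mysql://db-host:3306/db_c']
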
+ 76 - 0
dw_base/datax/plugins/reader/postgresql_reader.py

@@ -0,0 +1,76 @@
+# -*- coding:utf-8 -*-
+
+import re
+from configparser import ConfigParser
+
+from dw_base.datax.datax_constants import *
+from dw_base.datax.plugins.reader.reader import Reader
+
+# PostgreSQL reader
+from dw_base.datax.plugins.writer.postgresql_writer import POSTGRE_SQL_WRITER_PARAMETER_COLUMN, \
+    POSTGRE_SQL_WRITER_PARAMETER_CONNECTION, POSTGRE_SQL_WRITER_PARAMETER_DATABASE, POSTGRE_SQL_WRITER_PARAMETER_TABLE
+
+POSTGRE_SQL_READER_NAME = 'postgresqlreader'
+POSTGRE_SQL_READER_PARAMETER_CONNECTION = 'connection'
+POSTGRE_SQL_READER_PARAMETER_DATABASE = 'database'
+POSTGRE_SQL_READER_PARAMETER_FETCH_SIZE = 'fetchSize'
+POSTGRE_SQL_READER_PARAMETER_QUERY_SQL = 'querySql'
+POSTGRE_SQL_READER_PARAMETER_TABLE = 'table'
+POSTGRE_SQL_READER_PARAMETER_COLUMN = 'column'
+POSTGRE_SQL_READER_PARAMETER_WHERE = 'where'
+POSTGRE_SQL_READER_PARAMETER_SPLIT_PK = 'splitPk'
+
+
+class PostgreSQLReader(Reader):
+
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
+        super(PostgreSQLReader, self).__init__(base_dir, config_parser, start_date, stop_date)
+        self.plugin_name = POSTGRE_SQL_READER_NAME
+
+    def load_others(self):
+        start_date = self.start_date
+        stop_date = self.stop_date
+        database = self.config_parser.get(self.plugin_type, POSTGRE_SQL_WRITER_PARAMETER_DATABASE)
+        self.check_config(POSTGRE_SQL_WRITER_PARAMETER_DATABASE, database)
+        table = self.config_parser.get(self.plugin_type, POSTGRE_SQL_WRITER_PARAMETER_TABLE)
+        self.check_config(POSTGRE_SQL_WRITER_PARAMETER_TABLE, table)
+        fetch_size = self.config_parser.get(self.plugin_type, POSTGRE_SQL_READER_PARAMETER_FETCH_SIZE) or '1000'
+        self.parameter[POSTGRE_SQL_READER_PARAMETER_FETCH_SIZE] = fetch_size
+        split_pk = self.config_parser.get(self.plugin_type, POSTGRE_SQL_READER_PARAMETER_SPLIT_PK)
+        self.parameter[POSTGRE_SQL_READER_PARAMETER_SPLIT_PK] = split_pk
+        where = self.config_parser.get(self.plugin_type, POSTGRE_SQL_READER_PARAMETER_WHERE)
+        where = where.replace('${start_date}', start_date)
+        where = where.replace('${start-date}', start_date)
+        where = where.replace('${stop_date}', stop_date)
+        where = where.replace('${stop-date}', stop_date)
+        self.parameter[POSTGRE_SQL_READER_PARAMETER_WHERE] = where
+        jdbc_url: str = self.parameter[DS_POSTGRE_SQL_JDBC_URL]
+        matcher = re.search('jdbc:postgresql://(.+?)/(.+)', jdbc_url)
+        if matcher:
+            if database:
+                jdbc_url = jdbc_url.replace(matcher.group(2), database)
+        elif jdbc_url.endswith('/'):
+            jdbc_url = f'{jdbc_url}{database}'
+        else:
+            jdbc_url = f'{jdbc_url}/{database}'
+        query_sql = self.config_parser.get(self.plugin_type, POSTGRE_SQL_READER_PARAMETER_QUERY_SQL)
+        query_sql = query_sql.replace('${start_date}', start_date)
+        query_sql = query_sql.replace('${start-date}', start_date)
+        query_sql = query_sql.replace('${stop_date}', stop_date)
+        query_sql = query_sql.replace('${stop-date}', stop_date)
+        if query_sql:
+            connection = {
+                DS_POSTGRE_SQL_JDBC_URL: jdbc_url.split(','),
+                POSTGRE_SQL_READER_PARAMETER_QUERY_SQL: query_sql
+            }
+        else:
+            connection = {
+                DS_POSTGRE_SQL_JDBC_URL: jdbc_url.split(','),
+                POSTGRE_SQL_READER_PARAMETER_TABLE: table.split(',')
+            }
+        self.parameter[POSTGRE_SQL_WRITER_PARAMETER_CONNECTION] = [connection]
+        del self.parameter[DS_POSTGRE_SQL_JDBC_URL]
+
+    def load_column(self):
+        columns = self.config_parser.get(self.plugin_type, POSTGRE_SQL_WRITER_PARAMETER_COLUMN).split(',')
+        self.parameter[POSTGRE_SQL_WRITER_PARAMETER_COLUMN] = columns

+ 14 - 0
dw_base/datax/plugins/reader/reader.py

@@ -0,0 +1,14 @@
+# -*- coding:utf-8 -*-
+
+from configparser import ConfigParser
+
+from dw_base.datax.plugins.plugin import Plugin
+
+
+class Reader(Plugin):
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
+        super(Reader, self).__init__(base_dir, config_parser, start_date, stop_date)
+        self.plugin_type = 'reader'
+
+    def load_others(self):
+        raise NotImplementedError('please implement this method in a specified reader.')

+ 2 - 0
dw_base/datax/plugins/writer/__init__.py

@@ -0,0 +1,2 @@
+# -*- coding:utf-8 -*-
+

+ 57 - 0
dw_base/datax/plugins/writer/clickhouse_writer.py

@@ -0,0 +1,57 @@
+# -*- coding:utf-8 -*-
+
+import re
+from configparser import ConfigParser
+
+from dw_base.datax.datax_constants import *
+from dw_base.datax.plugins.writer.writer import Writer
+
+# clickhouse writer
+CLICK_HOUSE_WRITER_NAME = 'clickhousewriter'
+CLICK_HOUSE_WRITER_PARAMETER_BATCH_SIZE = 'batchSize'
+CLICK_HOUSE_WRITER_PARAMETER_CONNECTION = 'connection'
+CLICK_HOUSE_WRITER_PARAMETER_COLUMN = 'column'
+CLICK_HOUSE_WRITER_PARAMETER_DATABASE = 'database'
+CLICK_HOUSE_WRITER_PARAMETER_POST_SQL = 'postSql'
+CLICK_HOUSE_WRITER_PARAMETER_PRE_SQL = 'preSql'
+CLICK_HOUSE_WRITER_PARAMETER_TABLE = 'table'
+CLICK_HOUSE_WRITER_PARAMETER_WRITE_MODE = 'writeMode'
+
+
+class ClickHouseWriter(Writer):
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_time: str = None, stop_time: str = None):
+        super(ClickHouseWriter, self).__init__(base_dir, config_parser, start_time, stop_time)
+        self.plugin_name = CLICK_HOUSE_WRITER_NAME
+
+    def load_others(self):
+        database = self.config_parser.get(self.plugin_type, CLICK_HOUSE_WRITER_PARAMETER_DATABASE)
+        self.check_config(CLICK_HOUSE_WRITER_PARAMETER_DATABASE, database)
+        table = self.config_parser.get(self.plugin_type, CLICK_HOUSE_WRITER_PARAMETER_TABLE)
+        self.check_config(CLICK_HOUSE_WRITER_PARAMETER_TABLE, table)
+        jdbc_url: str = self.parameter[DS_CLICK_HOUSE_JDBC_URL]
+        matcher = re.search('jdbc:clickhouse://(.+?)/(.+)', jdbc_url)
+        if matcher:
+            if database:
+                jdbc_url = jdbc_url.replace(matcher.group(2), database)
+        elif jdbc_url.endswith('/'):
+            jdbc_url = f'{jdbc_url}{database}'
+        else:
+            jdbc_url = f'{jdbc_url}/{database}'
+        connection = {
+            DS_CLICK_HOUSE_JDBC_URL: jdbc_url,
+            CLICK_HOUSE_WRITER_PARAMETER_TABLE: table.split(',')
+        }
+        self.parameter[CLICK_HOUSE_WRITER_PARAMETER_CONNECTION] = [connection]
+        del self.parameter[DS_CLICK_HOUSE_JDBC_URL]
+        post_sql = self.config_parser.get(self.plugin_type, CLICK_HOUSE_WRITER_PARAMETER_POST_SQL)
+        self.parameter[CLICK_HOUSE_WRITER_PARAMETER_POST_SQL] = post_sql.split(';') if post_sql else []
+        pre_sql = self.config_parser.get(self.plugin_type, CLICK_HOUSE_WRITER_PARAMETER_PRE_SQL)
+        self.parameter[CLICK_HOUSE_WRITER_PARAMETER_PRE_SQL] = pre_sql.split(';') if pre_sql else []
+        self.parameter[CLICK_HOUSE_WRITER_PARAMETER_WRITE_MODE] = \
+            self.config_parser.get(self.plugin_type, CLICK_HOUSE_WRITER_PARAMETER_WRITE_MODE) or 'insert'
+        self.parameter[CLICK_HOUSE_WRITER_PARAMETER_BATCH_SIZE] = \
+            self.config_parser.get(self.plugin_type, CLICK_HOUSE_WRITER_PARAMETER_BATCH_SIZE) or '1024'
+
+    def load_column(self):
+        columns = self.config_parser.get(self.plugin_type, CLICK_HOUSE_WRITER_PARAMETER_COLUMN).split(',')
+        self.parameter[CLICK_HOUSE_WRITER_PARAMETER_COLUMN] = columns

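Across these writers the `config_parser.get(...) or default` idiom only covers keys that are present but empty; a missing key still raises NoOptionError. A small sketch of that behaviour (the section contents below are hypothetical):

    from configparser import ConfigParser

    cp = ConfigParser()
    cp.read_string("[writer]\nwriteMode =\nbatchSize = 2048\n")
    cp.get('writer', 'writeMode') or 'insert'    # empty value -> falls back to 'insert'
    cp.get('writer', 'batchSize') or '1024'      # explicit value wins -> '2048'
    # cp.get('writer', 'preSql') would raise NoOptionError because the key is absent
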
+ 28 - 0
dw_base/datax/plugins/writer/elasticsearch_writer.py

@@ -0,0 +1,28 @@
+# -*- coding:utf-8 -*-
+
+from configparser import ConfigParser
+
+from dw_base.datax.plugins.writer.writer import Writer
+
+# elasticsearch writer
+ES_WRITER_NAME = 'elasticsearch-writer'
+ES_WRITER_PARAMETER_NODES = 'nodes'
+ES_WRITER_PARAMETER_BATCH_SIZE = 'batchSize'
+ES_WRITER_PARAMETER_INDEX = 'index'
+ES_WRITER_PARAMETER_SPEED_PER_SECOND = 'speedPerSecond'
+ES_WRITER_PARAMETER_WRITE_MODE = 'writeMode'
+
+
+class ElasticsearchWriter(Writer):
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
+        super(ElasticsearchWriter, self).__init__(base_dir, config_parser, start_date, stop_date)
+        self.plugin_name = ES_WRITER_NAME
+
+    def load_others(self):
+        self.parameter[ES_WRITER_PARAMETER_BATCH_SIZE] = self.config_parser.get(self.plugin_type,
+                                                                                ES_WRITER_PARAMETER_BATCH_SIZE)
+        self.parameter[ES_WRITER_PARAMETER_INDEX] = self.config_parser.get(self.plugin_type, ES_WRITER_PARAMETER_INDEX)
+        self.parameter[ES_WRITER_PARAMETER_SPEED_PER_SECOND] = self.config_parser.get(self.plugin_type,
+                                                                                      ES_WRITER_PARAMETER_SPEED_PER_SECOND)
+        self.parameter[ES_WRITER_PARAMETER_WRITE_MODE] = self.config_parser.get(self.plugin_type,
+                                                                                ES_WRITER_PARAMETER_WRITE_MODE)

+ 155 - 0
dw_base/datax/plugins/writer/hbase_writer.py

@@ -0,0 +1,155 @@
+# -*- coding:utf-8 -*-
+
+import re
+from configparser import ConfigParser
+from typing import Dict, List
+
+from dw_base.datax.plugins.writer.writer import Writer
+
+# hbase writer
+HBASE_WRITER_NAME = 'hbaseapiwriter'
+HBASE_WRITER_PARAMETER_COLUMN_FAMILY = 'columnFamily'
+HBASE_WRITER_PARAMETER_CONF = 'conf'
+HBASE_WRITER_PARAMETER_END_KEY = 'endKey'
+HBASE_WRITER_PARAMETER_NAMESPACE = 'namespace'
+HBASE_WRITER_PARAMETER_REGION_NUMBER = 'regionNumber'
+HBASE_WRITER_PARAMETER_ROW_KEY_COLUMN = 'rowKeyColumn'
+HBASE_WRITER_PARAMETER_START_KEY = 'startKey'
+HBASE_WRITER_PARAMETER_TABLE = 'table'
+HBASE_WRITER_PARAMETER_TRUNCATE = 'truncate'
+HBASE_WRITER_PARAMETER_WRITE_BATCH_SIZE = 'writeBatchSize'
+
+
+class HBaseWriter(Writer):
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
+        super(HBaseWriter, self).__init__(base_dir, config_parser, start_date, stop_date)
+        self.plugin_name = HBASE_WRITER_NAME
+
+    def load_data_source(self):
+        ds_dict: Dict[str, str] = self.datasource.parse()
+        conf = {}
+        for key, value in ds_dict.items():
+            conf[key] = value
+        self.parameter[HBASE_WRITER_PARAMETER_CONF] = conf
+
+    def load_others(self):
+        # conf is populated in load_data_source (the dataSource section)
+        # conf = self.config_parser.get(self.plugin_type, 'conf')
+        # if conf:
+        #     self.parameter[HBASE_WRITER_PARAMETER_CONF] = json.loads(conf)
+        # self.parameter[HBASE_WRITER_PARAMETER_CONF] = 'conf'
+        end_key = self.config_parser.get(self.plugin_type, HBASE_WRITER_PARAMETER_END_KEY)
+        if end_key:
+            self.parameter[HBASE_WRITER_PARAMETER_END_KEY] = end_key
+        start_key = self.config_parser.get(self.plugin_type, HBASE_WRITER_PARAMETER_START_KEY)
+        if start_key:
+            self.parameter[HBASE_WRITER_PARAMETER_START_KEY] = start_key
+        self.parameter[HBASE_WRITER_PARAMETER_REGION_NUMBER] = self.config_parser.get(self.plugin_type,
+                                                                                      HBASE_WRITER_PARAMETER_REGION_NUMBER) or 3
+        namespace = self.config_parser.get(self.plugin_type, HBASE_WRITER_PARAMETER_NAMESPACE)
+        self.parameter[HBASE_WRITER_PARAMETER_NAMESPACE] = namespace
+        table = self.config_parser.get(self.plugin_type, HBASE_WRITER_PARAMETER_TABLE)
+        self.check_config(HBASE_WRITER_PARAMETER_TABLE, table)
+        if table:
+            self.parameter[HBASE_WRITER_PARAMETER_TABLE] = table
+        truncate = self.config_parser.get(self.plugin_type, HBASE_WRITER_PARAMETER_TRUNCATE)
+        self.parameter[HBASE_WRITER_PARAMETER_TRUNCATE] = truncate and truncate.lower() == 'true'
+        batch_size = self.config_parser.get(self.plugin_type, HBASE_WRITER_PARAMETER_WRITE_BATCH_SIZE)
+        try:
+            self.parameter[HBASE_WRITER_PARAMETER_WRITE_BATCH_SIZE] = int(batch_size)
+        except (TypeError, ValueError):
+            self.parameter[HBASE_WRITER_PARAMETER_WRITE_BATCH_SIZE] = 10000
+
+    def load_column(self):
+        super(HBaseWriter, self).load_column()
+        self.parameter[HBASE_WRITER_PARAMETER_COLUMN_FAMILY] = self.config_parser.get(self.plugin_type,
+                                                                                      HBASE_WRITER_PARAMETER_COLUMN_FAMILY)
+        row_key_columns = []
+        row_key_column_definition = self.config_parser.get(self.plugin_type,
+                                                           HBASE_WRITER_PARAMETER_ROW_KEY_COLUMN) \
+            .split(',')  # type: [str]
+        separator_pattern = r'separator\((.+?)\)'
+        reverse_pattern = r'reverse\((.+?)\)'
+        for row_key_column in row_key_column_definition:
+            separator_matcher = re.search(separator_pattern, row_key_column)
+            reverse_matcher = re.match(reverse_pattern, row_key_column)
+            if separator_matcher:
+                separator = separator_matcher.group(1)
+                row_key_columns.append(
+                    {
+                        "index": -1,
+                        "value": separator,
+                        "type": "string"
+                    }
+                )
+                continue
+            elif reverse_matcher:
+                row_key_column = reverse_matcher.group(1)
+                row_key_index = -1
+                for index in range(len(self.columns)):
+                    col = self.columns[index]  # type: str
+                    if col.endswith(':' + row_key_column):
+                        row_key_index = index
+                        break
+                if row_key_index != -1:
+                    row_key_columns.append(
+                        {
+                            "index": row_key_index,
+                            "reverse": True,
+                            "type": "string"
+                        }
+                    )
+                else:
+                    raise Exception('specified row key column %s not found in columns' % row_key_column)
+            else:
+                row_key_index = -1
+                for index in range(len(self.columns)):
+                    col = self.columns[index]  # type: str
+                    if col.endswith(':' + row_key_column):
+                        row_key_index = index
+                        break
+                if row_key_index != -1:
+                    row_key_columns.append(
+                        {
+                            "index": row_key_index,
+                            "type": "string"
+                        }
+                    )
+                else:
+                    raise Exception('specified row key column %s not found in columns' % row_key_column)
+        self.parameter[HBASE_WRITER_PARAMETER_ROW_KEY_COLUMN] = row_key_columns
+
+    @staticmethod
+    def generate_definition(hbase_ds_name: str,
+                            hbase_namespace: str,
+                            hbase_table_name: str,
+                            hive_table_name: str,
+                            hive_table_comment: str,
+                            column_family: str,
+                            column_names: List[str],
+                            column_types: Dict[str, str],
+                            row_key_columns: List[str]) -> str:
+        column = []
+        column_type = []
+        for col_name in column_names:
+            column.append(col_name)
+            if column_types.__contains__(col_name):
+                column_type.append(f'{col_name}:{column_types.get(col_name)}')
+        column_type = ','.join(column_type)
+        definition = [
+            '[writer]',
+            f'# {hive_table_name}: {hive_table_comment}',
+            'dataSource = %s' % hbase_ds_name,
+            'namespace = %s' % hbase_namespace,
+            'table = %s' % hbase_table_name,
+            f'columnFamily = {column_family}',
+            f'column = {",".join(column)}',
+            f'columnType = {column_type}',
+            f'rowKeyColumn = {",".join(row_key_columns)}',
+            'truncate = false',
+            'startKey = 00',
+            'endKey = 99',
+            'regionNumber = 101',
+            'writeBatchSize = 100000',
+        ]
+        return '\n'.join(definition)

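rowKeyColumn is a small DSL: a plain name is resolved against `column` by its `cf:name` suffix, `separator(x)` inserts the literal x, and `reverse(name)` marks a reversed column. For a hypothetical configuration the resulting DataX structure looks like this:

    # column       = cf:id,cf:name,cf:phone
    # rowKeyColumn = id,separator(_),reverse(phone)
    row_key_columns = [
        {"index": 0, "type": "string"},                   # id -> column position 0
        {"index": -1, "value": "_", "type": "string"},    # literal separator
        {"index": 2, "reverse": True, "type": "string"},  # phone, reversed
    ]
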
+ 95 - 0
dw_base/datax/plugins/writer/hdfs_writer.py

@@ -0,0 +1,95 @@
+# -*- coding:utf-8 -*-
+
+from configparser import ConfigParser
+from datetime import datetime, timedelta
+from typing import Dict, List
+
+from dw_base.datax.plugins.writer.writer import Writer
+
+# hdfs writer
+HDFS_WRITER_NAME = 'hdfswriter'
+HDFS_WRITER_PARAMETER_COMPRESS = 'compress'
+HDFS_WRITER_PARAMETER_ENCODING = 'encoding'
+HDFS_WRITER_PARAMETER_FIELD_DELIMITER = 'fieldDelimiter'
+HDFS_WRITER_PARAMETER_FILE_TYPE = 'fileType'
+HDFS_WRITER_PARAMETER_PATH = 'path'
+HDFS_WRITER_PARAMETER_WRITE_MODE = 'writeMode'
+HDFS_WRITER_PARAMETER_FILE_NAME = 'fileName'
+
+
+class HDFSWriter(Writer):
+    """
+    HDFSWriter有4个时间字段:
+    1. start_date:表示数据的起始日期,从Reader处传递而来,全量时传递19700101即可
+    2. stop_date:表示数据的终止日期,从Reader传递而来
+    3. dt:表示分区日期,值为stop_date - 1 day
+    4. biz_date:表示文件的前缀,当start_date + 1 day = stop_date时,值为start_date,否则值为${start_date}-${stop_date - 1 day}
+    """
+
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
+        super(HDFSWriter, self).__init__(base_dir, config_parser, start_date, stop_date)
+        self.plugin_name = HDFS_WRITER_NAME
+
+    def load_others(self):
+        path = self.config_parser.get(self.plugin_type, HDFS_WRITER_PARAMETER_PATH)
+        self.check_config(HDFS_WRITER_PARAMETER_PATH, path)
+        if path.__contains__('${dt}'):
+            stop_at = datetime.strptime(self.stop_date, '%Y%m%d')
+            dt = (stop_at - timedelta(days=1)).strftime('%Y%m%d')
+            path = path.replace('${dt}', dt)
+        self.parameter[HDFS_WRITER_PARAMETER_PATH] = path
+        self.parameter[HDFS_WRITER_PARAMETER_FILE_TYPE] = \
+            self.config_parser.get(self.plugin_type, HDFS_WRITER_PARAMETER_FILE_TYPE) or 'text'
+        if self.config_parser.has_option(self.plugin_type, HDFS_WRITER_PARAMETER_COMPRESS):
+            self.parameter[HDFS_WRITER_PARAMETER_COMPRESS] = self.config_parser.get(self.plugin_type, HDFS_WRITER_PARAMETER_COMPRESS)
+        self.parameter[HDFS_WRITER_PARAMETER_ENCODING] = \
+            self.config_parser.get(self.plugin_type, HDFS_WRITER_PARAMETER_ENCODING) or 'UTF-8'
+        self.parameter[HDFS_WRITER_PARAMETER_WRITE_MODE] = \
+            self.config_parser.get(self.plugin_type, HDFS_WRITER_PARAMETER_WRITE_MODE) or 'append'
+        self.parameter[HDFS_WRITER_PARAMETER_FIELD_DELIMITER] = \
+            self.config_parser.get(self.plugin_type, HDFS_WRITER_PARAMETER_FIELD_DELIMITER).replace("\\t", "\t") or '\t'
+        self.parameter['kerberosPrincipal'] = 'hdfs@LIXIAOYUN.COM'
+        self.get_file_name()
+
+    def get_file_name(self):
+        start_at = datetime.strptime(self.start_date, '%Y%m%d')
+        stop_at = datetime.strptime(self.stop_date, '%Y%m%d')
+        biz_date = self.start_date
+        if stop_at - timedelta(days=1) > start_at:
+            biz_date = f'{self.start_date}-{(stop_at - timedelta(days=1)).strftime("%Y%m%d")}'
+        self.parameter[HDFS_WRITER_PARAMETER_FILE_NAME] = \
+            self.config_parser.get(self.plugin_type, HDFS_WRITER_PARAMETER_FILE_NAME).replace('${biz_date}', biz_date)
+
+    @staticmethod
+    def generate_definition(hdfs_ds_name: str, hdfs_path: str,
+                            hive_database: str, hive_table_name: str, partitioned: bool,
+                            column_names: List[str], column_types: Dict[str, str]) -> str:
+        if partitioned:
+            # partitioned table
+            path = f'{hdfs_path}/{hive_database}.db/{hive_table_name}/dt=%s' % '${dt}'
+        else:
+            # non-partitioned table
+            path = f'{hdfs_path}/{hive_database}.db/{hive_table_name}'
+        column = []
+        column_type = []
+        for col_name in column_names:
+            column.append(col_name)
+            if column_types.__contains__(col_name):
+                column_type.append(f'{col_name}:{column_types.get(col_name)}')
+        column_type = ','.join(column_type)
+        definition = [
+            '[writer]',
+            'dataSource = %s' % hdfs_ds_name,
+            f'path = {path}',
+            f'column = {",".join(column)}',
+            f'columnType = {column_type}',
+            'fileType = orc',
+            # 'fileName = ${biz_date}',
+            f'fileName = {hive_table_name}',
+            'compress = NONE',
+            'encoding = utf-8',
+            ';writeMode支持append、nonConflict和truncate',
+            'writeMode = truncate',
+            r'fieldDelimiter = \t'
+        ]
+        return '\n'.join(definition)

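A standalone sketch of how dt and biz_date fall out of the run window described in the docstring (the dates are illustrative):

    from datetime import datetime, timedelta

    start_date, stop_date = '20240101', '20240105'
    stop_at = datetime.strptime(stop_date, '%Y%m%d')
    dt = (stop_at - timedelta(days=1)).strftime('%Y%m%d')      # '20240104' -> partition date
    start_at = datetime.strptime(start_date, '%Y%m%d')
    biz_date = start_date
    if stop_at - timedelta(days=1) > start_at:
        biz_date = f'{start_date}-{dt}'                         # '20240101-20240104' -> file prefix
    # for a single-day run (stop_date = '20240102') biz_date stays '20240101'
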
+ 85 - 0
dw_base/datax/plugins/writer/kafka_writer.py

@@ -0,0 +1,85 @@
+# -*- coding:utf-8 -*-
+
+import re
+from configparser import ConfigParser
+from typing import List, Dict
+
+from dw_base.datax.plugins.writer.writer import Writer
+
+# kafka writer
+KAFKA_WRITER_NAME = 'kafka-writer'
+KAFKA_WRITER_PARAMETER_BROKERS = 'brokers'
+KAFKA_WRITER_PARAMETER_EXTRA_CONFIG = 'extraConfig'
+KAFKA_WRITER_PARAMETER_TOPIC = 'topic'
+KAFKA_WRITER_PARAMETER_KEY = 'key'
+KAFKA_WRITER_HIVE_ES_COLUMN_MAPPING = 'columnMapping'
+KAFKA_WRITER_SOURCE_NAME = 'sourceName'
+
+
+class KafkaWriter(Writer):
+
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
+        super().__init__(base_dir, config_parser, start_date, stop_date)
+        self.plugin_name = KAFKA_WRITER_NAME
+
+    def load_others(self):
+        self.parameter[KAFKA_WRITER_PARAMETER_TOPIC] = self.config_parser.get(self.plugin_type,
+                                                                              KAFKA_WRITER_PARAMETER_TOPIC)
+        self.parameter[KAFKA_WRITER_PARAMETER_KEY] = self.config_parser.get(self.plugin_type,
+                                                                            KAFKA_WRITER_PARAMETER_KEY)
+        extra_config_raw = self.config_parser.get(self.plugin_type, KAFKA_WRITER_PARAMETER_EXTRA_CONFIG)
+        extra_config = {}
+        for kv in extra_config_raw.split(','):
+            splits = kv.split(':')
+            if len(splits) != 2:
+                continue
+            extra_config[splits[0]] = splits[1]
+        self.parameter[KAFKA_WRITER_PARAMETER_EXTRA_CONFIG] = extra_config
+        if self.config_parser.has_option(self.plugin_type, KAFKA_WRITER_HIVE_ES_COLUMN_MAPPING):
+            self.parameter[KAFKA_WRITER_HIVE_ES_COLUMN_MAPPING] = self.config_parser.get(self.plugin_type,
+                                                                                         KAFKA_WRITER_HIVE_ES_COLUMN_MAPPING)
+        else:
+            self.parameter[KAFKA_WRITER_HIVE_ES_COLUMN_MAPPING] = ''
+        if self.config_parser.has_option(self.plugin_type, KAFKA_WRITER_SOURCE_NAME):
+            self.parameter[KAFKA_WRITER_SOURCE_NAME] = self.config_parser.get(self.plugin_type,
+                                                                              KAFKA_WRITER_SOURCE_NAME)
+        else:
+            self.parameter[KAFKA_WRITER_SOURCE_NAME] = ''
+
+    @staticmethod
+    def generate_definition(kafka_ds_name: str,
+                            kafka_topic: str,
+                            kafka_key: str,
+                            source_name: str,
+                            column_names: List[str],
+                            column_types: Dict[str, str],
+                            column_mapping: str) -> str:
+        column_type = []
+        column_name_mapping = {'pid': 'pid:ID', 'id': 'pid:ID', 'esId': 'es_id:esId'}
+        column_type_mapping = ['string', 'double', 'int', 'long', 'bigint', 'boolean']
+        for col_name in column_names:
+            if column_name_mapping.keys().__contains__(col_name):
+                column_type.append(column_name_mapping.get(col_name))
+                continue
+            if column_types.keys().__contains__(col_name):
+                col_type = column_types.get(col_name)
+                if col_type.lower() in column_type_mapping:
+                    column_type.append(f'{col_name}:{col_type.upper()}')
+                else:
+                    if re.search(r'array<struct|array<string|struct<', col_type.lower()):
+                        column_type.append(f'{col_name}:{col_type.replace(",", "#").replace(":", "@")}')
+                    else:
+                        column_type.append(f'{col_name}:JSON')
+
+        definition = [
+            '[writer]',
+            'dataSource = %s' % kafka_ds_name,
+            f'topic = {kafka_topic}',
+            f'key = {kafka_key}',
+            f'sourceName = {source_name}',
+            f'column = {",".join(column_names)}',
+            f'columnType = {",".join(column_type)}',
+            f'columnMapping = {column_mapping}',
+            f'extraConfig = auto.commit.interval.ms:5000',
+        ]
+        return '\n'.join(definition)

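generate_definition flattens complex Hive types so they survive the comma/colon-delimited columnType format: simple types become `name:TYPE`, array/struct types have their commas and colons rewritten to `#` and `@`, and anything else falls back to JSON. The mapping in isolation (column names and types are hypothetical):

    import re

    column_types = {
        'amount': 'double',
        'items': 'array<struct<sku:string,qty:int>>',
        'extra': 'map<string,string>',
    }
    column_type = []
    for col_name, col_type in column_types.items():
        if col_type.lower() in ('string', 'double', 'int', 'long', 'bigint', 'boolean'):
            column_type.append(f'{col_name}:{col_type.upper()}')
        elif re.search(r'array<struct|array<string|struct<', col_type.lower()):
            column_type.append(f'{col_name}:{col_type.replace(",", "#").replace(":", "@")}')
        else:
            column_type.append(f'{col_name}:JSON')
    # -> ['amount:DOUBLE', 'items:array<struct<sku@string#qty@int>>', 'extra:JSON']
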
+ 72 - 0
dw_base/datax/plugins/writer/mongo_writer.py

@@ -0,0 +1,72 @@
+# -*- coding:utf-8 -*-
+
+from configparser import ConfigParser
+from typing import List, Dict
+
+from dw_base.datax.plugins.writer.writer import Writer
+
+# mongo writer
+MONGO_WRITER_NAME = 'mongodbwriter'
+MONGO_WRITER_PARAMETER_DB_NAME = 'dbName'
+MONGO_WRITER_PARAMETER_COLLECTION_NAME = 'collectionName'
+MONGO_SPECIAL_WORDS_DICT = {
+    'company_name': 'COMPANYNAME',
+    'pid': 'PID',
+    'uncid': 'UNCID',
+    'unc_id': 'UNCID',
+    'url': 'URL',
+    'url_desc': 'URL_DESC',
+    'web_name': 'WEBNAME',
+}
+# isReplace, replaceKey
+MONGO_WRITER_PARAMETER_WRITE_MODE = 'writeMode'
+
+
+class MongoWriter(Writer):
+
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
+        super(MongoWriter, self).__init__(base_dir, config_parser, start_date, stop_date)
+        self.plugin_name = MONGO_WRITER_NAME
+
+    def load_others(self):
+        db_name = self.config_parser.get(self.plugin_type, MONGO_WRITER_PARAMETER_DB_NAME)
+        self.check_config(MONGO_WRITER_PARAMETER_DB_NAME, db_name)
+        self.parameter[MONGO_WRITER_PARAMETER_DB_NAME] = db_name
+        collection_name = self.config_parser.get(self.plugin_type, MONGO_WRITER_PARAMETER_COLLECTION_NAME)
+        self.check_config(MONGO_WRITER_PARAMETER_COLLECTION_NAME, collection_name)
+        self.parameter[MONGO_WRITER_PARAMETER_COLLECTION_NAME] = collection_name
+        upsert_info_str = self.config_parser.get(self.plugin_type, MONGO_WRITER_PARAMETER_WRITE_MODE)
+        upsert_info = {}
+        if upsert_info_str:
+            for item in upsert_info_str.split(','):
+                k, v = item.split(':')
+                upsert_info[k] = v
+        self.parameter[MONGO_WRITER_PARAMETER_WRITE_MODE] = upsert_info
+
+    @staticmethod
+    def generate_definition(mongo_ds_name: str, mongo_database: str, mongo_collection: str,
+                            column_names: List[str], column_types: Dict[str, str], pk_fields: List[str]) -> str:
+        column = []
+        column_type = []
+        column_format = []
+        for col_name in column_names:
+            column.append(col_name)
+            if column_types.__contains__(col_name):
+                curr_type = column_types.get(col_name)
+                curr_type_upper = curr_type.upper()
+                if curr_type_upper != 'STRING':
+                    column_type.append(f'{col_name}:{curr_type_upper}')
+                    if curr_type_upper == 'DATE':
+                        column_format.append(f'{col_name}##yyyy-MM-dd HH:mm:ss')
+        write_mode = f'isReplace:true,replaceKey:{"_".join(pk_fields)}'
+        definition = [
+            '[writer]',
+            'dataSource = %s' % mongo_ds_name,
+            f'dbName = {mongo_database}',
+            f'collectionName = {mongo_collection}',
+            f'column = {",".join(column)}',
+            f'columnType = {",".join(column_type)}',
+            f'columnFormat = {",".join(column_format)}',
+            f'writeMode = {write_mode}'
+        ]
+        return '\n'.join(definition)

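writeMode is carried as `key:value` pairs (isReplace/replaceKey, per the comment above the constant) and parsed into a dict for the DataX mongodbwriter. A standalone sketch of the round trip; the pid/dt key fields are hypothetical:

    upsert_info_str = 'isReplace:true,replaceKey:pid'
    upsert_info = dict(item.split(':') for item in upsert_info_str.split(','))
    # -> {'isReplace': 'true', 'replaceKey': 'pid'}

    # generate_definition rebuilds the same string from the primary-key fields:
    pk_fields = ['pid', 'dt']
    write_mode = f'isReplace:true,replaceKey:{"_".join(pk_fields)}'   # 'isReplace:true,replaceKey:pid_dt'
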
+ 60 - 0
dw_base/datax/plugins/writer/mysql_writer.py

@@ -0,0 +1,60 @@
+# -*- coding:utf-8 -*-
+
+import re
+from configparser import ConfigParser
+
+from dw_base.datax.datax_constants import *
+from dw_base.datax.plugins.writer.writer import Writer
+
+# mysql writer
+MYSQL_WRITER_NAME = 'mysqlwriter'
+MYSQL_WRITER_PARAMETER_BATCH_SIZE = 'batchSize'
+MYSQL_WRITER_PARAMETER_CONNECTION = 'connection'
+MYSQL_WRITER_PARAMETER_COLUMN = 'column'
+MYSQL_WRITER_PARAMETER_DATABASE = 'database'
+MYSQL_WRITER_PARAMETER_POST_SQL = 'postSql'
+MYSQL_WRITER_PARAMETER_PRE_SQL = 'preSql'
+MYSQL_WRITER_PARAMETER_TABLE = 'table'
+MYSQL_WRITER_PARAMETER_WRITE_MODE = 'writeMode'
+
+
+class MySQLWriter(Writer):
+
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_time: str = None, stop_time: str = None):
+        super(MySQLWriter, self).__init__(base_dir, config_parser, start_time, stop_time)
+        self.plugin_name = MYSQL_WRITER_NAME
+
+    def load_others(self):
+        database = self.config_parser.get(self.plugin_type, MYSQL_WRITER_PARAMETER_DATABASE)
+        self.check_config(MYSQL_WRITER_PARAMETER_DATABASE, database)
+        table = self.config_parser.get(self.plugin_type, MYSQL_WRITER_PARAMETER_TABLE)
+        self.check_config(MYSQL_WRITER_PARAMETER_TABLE, table)
+        jdbc_url: str = self.parameter[DS_MYSQL_JDBC_URL]
+        matcher = re.search('jdbc:mysql://(.+?)/(.+)', jdbc_url)
+        pre_sql = self.config_parser.get(self.plugin_type, MYSQL_WRITER_PARAMETER_PRE_SQL)
+        if matcher:
+            jdbc_url = jdbc_url.replace(matcher.group(2), database)
+        elif jdbc_url.endswith('/'):
+            jdbc_url = f'{jdbc_url}{database}'
+        else:
+            jdbc_url = f'{jdbc_url}/{database}'
+        if pre_sql.__contains__('${dt}'):
+            pre_sql = pre_sql.replace('${dt}', self.start_date)
+        connection = {
+            DS_MYSQL_JDBC_URL: f'{jdbc_url}?useSSL=false',
+            MYSQL_WRITER_PARAMETER_TABLE: table.split(',')
+        }
+        self.parameter[MYSQL_WRITER_PARAMETER_CONNECTION] = [connection]
+        del self.parameter[DS_MYSQL_JDBC_URL]
+        post_sql = self.config_parser.get(self.plugin_type, MYSQL_WRITER_PARAMETER_POST_SQL)
+        self.parameter[MYSQL_WRITER_PARAMETER_POST_SQL] = post_sql.split(';') if post_sql else []
+        self.parameter[MYSQL_WRITER_PARAMETER_PRE_SQL] = \
+            pre_sql.split(';') if pre_sql else []
+        self.parameter[MYSQL_WRITER_PARAMETER_WRITE_MODE] = \
+            self.config_parser.get(self.plugin_type, MYSQL_WRITER_PARAMETER_WRITE_MODE) or 'insert'
+        self.parameter[MYSQL_WRITER_PARAMETER_BATCH_SIZE] = \
+            self.config_parser.get(self.plugin_type, MYSQL_WRITER_PARAMETER_BATCH_SIZE) or '1024'
+
+    def load_column(self):
+        columns = self.config_parser.get(self.plugin_type, MYSQL_WRITER_PARAMETER_COLUMN).split(',')
+        self.parameter[MYSQL_WRITER_PARAMETER_COLUMN] = columns

+ 57 - 0
dw_base/datax/plugins/writer/postgresql_writer.py

@@ -0,0 +1,57 @@
+# -*- coding:utf-8 -*-
+
+import re
+from configparser import ConfigParser
+
+from dw_base.datax.datax_constants import *
+from dw_base.datax.plugins.writer.writer import Writer
+
+# postgresql writer
+POSTGRE_SQL_WRITER_NAME = 'postgresqlwriter'
+POSTGRE_SQL_WRITER_PARAMETER_BATCH_SIZE = 'batchSize'
+POSTGRE_SQL_WRITER_PARAMETER_CONNECTION = 'connection'
+POSTGRE_SQL_WRITER_PARAMETER_COLUMN = 'column'
+POSTGRE_SQL_WRITER_PARAMETER_DATABASE = 'database'
+POSTGRE_SQL_WRITER_PARAMETER_POST_SQL = 'postSql'
+POSTGRE_SQL_WRITER_PARAMETER_PRE_SQL = 'preSql'
+POSTGRE_SQL_WRITER_PARAMETER_TABLE = 'table'
+POSTGRE_SQL_WRITER_PARAMETER_WRITE_MODE = 'writeMode'
+
+
+class PostgreSQLWriter(Writer):
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_time: str = None, stop_time: str = None):
+        super(PostgreSQLWriter, self).__init__(base_dir, config_parser, start_time, stop_time)
+        self.plugin_name = POSTGRE_SQL_WRITER_NAME
+
+    def load_others(self):
+        database = self.config_parser.get(self.plugin_type, POSTGRE_SQL_WRITER_PARAMETER_DATABASE)
+        self.check_config(POSTGRE_SQL_WRITER_PARAMETER_DATABASE, database)
+        table = self.config_parser.get(self.plugin_type, POSTGRE_SQL_WRITER_PARAMETER_TABLE)
+        self.check_config(POSTGRE_SQL_WRITER_PARAMETER_TABLE, table)
+        jdbc_url: str = self.parameter[DS_POSTGRE_SQL_JDBC_URL]
+        matcher = re.search('jdbc:postgresql://(.+?)/(.+)', jdbc_url)
+        if matcher:
+            if database:
+                jdbc_url = jdbc_url.replace(matcher.group(2), database)
+        elif jdbc_url.endswith('/'):
+            jdbc_url = f'{jdbc_url}{database}'
+        else:
+            jdbc_url = f'{jdbc_url}/{database}'
+        connection = {
+            DS_POSTGRE_SQL_JDBC_URL: jdbc_url,
+            POSTGRE_SQL_WRITER_PARAMETER_TABLE: table.split(',')
+        }
+        self.parameter[POSTGRE_SQL_WRITER_PARAMETER_CONNECTION] = [connection]
+        del self.parameter[DS_POSTGRE_SQL_JDBC_URL]
+        self.parameter[POSTGRE_SQL_WRITER_PARAMETER_POST_SQL] = \
+            self.config_parser.get(self.plugin_type, POSTGRE_SQL_WRITER_PARAMETER_POST_SQL).split(';') or []
+        self.parameter[POSTGRE_SQL_WRITER_PARAMETER_PRE_SQL] = \
+            self.config_parser.get(self.plugin_type, POSTGRE_SQL_WRITER_PARAMETER_PRE_SQL).split(';') or []
+        self.parameter[POSTGRE_SQL_WRITER_PARAMETER_WRITE_MODE] = \
+            self.config_parser.get(self.plugin_type, POSTGRE_SQL_WRITER_PARAMETER_WRITE_MODE) or 'insert'
+        self.parameter[POSTGRE_SQL_WRITER_PARAMETER_BATCH_SIZE] = \
+            self.config_parser.get(self.plugin_type, POSTGRE_SQL_WRITER_PARAMETER_BATCH_SIZE) or '1024'
+
+    def load_column(self):
+        columns = self.config_parser.get(self.plugin_type, POSTGRE_SQL_WRITER_PARAMETER_COLUMN).split(',')
+        self.parameter[POSTGRE_SQL_WRITER_PARAMETER_COLUMN] = columns
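A self-contained sketch of the JDBC URL rewriting performed in load_others above (the URL and database name are illustrative):

    import re

    jdbc_url = 'jdbc:postgresql://pg.example.internal:5432/olddb'
    database = 'dw_report'
    matcher = re.search('jdbc:postgresql://(.+?)/(.+)', jdbc_url)
    if matcher:
        if database:
            # swap the database segment captured by the regex
            jdbc_url = jdbc_url.replace(matcher.group(2), database)
    elif jdbc_url.endswith('/'):
        jdbc_url = f'{jdbc_url}{database}'
    else:
        jdbc_url = f'{jdbc_url}/{database}'
    print(jdbc_url)  # jdbc:postgresql://pg.example.internal:5432/dw_report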

+ 14 - 0
dw_base/datax/plugins/writer/writer.py

@@ -0,0 +1,14 @@
+# -*- coding:utf-8 -*-
+
+from configparser import ConfigParser
+
+from dw_base.datax.plugins.plugin import Plugin
+
+
+class Writer(Plugin):
+    def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date=None):
+        super(Writer, self).__init__(base_dir, config_parser, start_date, stop_date)
+        self.plugin_type = 'writer'
+
+    def load_others(self):
+        raise NotImplementedError('please implement this method in a concrete writer subclass.')
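A minimal sketch of what a concrete writer supplies, assuming Plugin initialises self.parameter as the writers above rely on (the class name and key are illustrative):

    from configparser import ConfigParser

    from dw_base.datax.plugins.writer.writer import Writer


    class DemoWriter(Writer):
        def __init__(self, base_dir: str, config_parser: ConfigParser, start_date: str = None, stop_date: str = None):
            super(DemoWriter, self).__init__(base_dir, config_parser, start_date, stop_date)
            self.plugin_name = 'demowriter'

        def load_others(self):
            # fill self.parameter with writer-specific keys read from config_parser,
            # as the MySQL/PostgreSQL writers above do
            self.parameter['writeMode'] = 'insert'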

+ 3 - 0
dw_base/ds/__init__.py

@@ -0,0 +1,3 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+

+ 19 - 0
dw_base/ds/config/base_config.yaml

@@ -0,0 +1,19 @@
+base_url: http://xxxx:12345/dolphinscheduler
+project_code:
+request_params:
+  processDefinitionCode:
+  scheduleTime:
+  failureStrategy: END
+  taskDependType: TASK_POST
+  execType: START_PROCESS
+  warningType: NONE
+  runMode: RUN_MODE_SERIAL
+  processInstancePriority: MEDIUM
+  workerGroup: cdh
+  tenantCode: alvis
+  startParams:
+  dryRun: 0
+  testFlag: 0
+  complementDependentMode: OFF_MODE
+  allLevelDependent: false
+  executionOrder: DESC_ORDER

+ 9 - 0
dw_base/ds/config/process_code.yaml

@@ -0,0 +1,9 @@
+project_code:
+  customs-data-mix: 15867179893120
+
+process_code:
+  customs-data-mix:
+    数据融合his_dataX: 15876817825536
+    数据融合fix_dataX: 15876835811200
+    数据融合del_dataX: 15876840305666
+    mix_fill_fix: 130549782670528

+ 76 - 0
dw_base/ds/ds_start_workflow.py

@@ -0,0 +1,76 @@
+"""
+start_workflow(project_name, process_name, start_params, token):
+    project_name: DolphinScheduler project name
+    process_name: workflow (process definition) name
+    start_params: workflow start parameters (dict)
+    token: DolphinScheduler API access token
+"""
+
+import json
+import requests
+import yaml
+import re
+import os
+import logging
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse/", abspath)
+config_path = os.getenv("CONFIG_PATH", "dw_base/ds/config/base_config.yaml")
+process_code_path = os.getenv("PROCESS_CODE_PATH", "dw_base/ds/config/process_code.yaml")
+
+
+def load_yaml_config(path):
+    try:
+        with open(path, 'r') as file:
+            config = yaml.safe_load(file)
+        return config
+    except FileNotFoundError:
+        logging.error(f"配置文件 {path} 未找到")
+        return {}
+    except Exception as e:
+        logging.error(f"读取配置文件时发生错误: {e}")
+        return {}
+
+
+def init_params(config):
+    params: dict = config.get("request_params")
+    for key in params.keys():
+        if params[key] is None:
+            params[key] = ""
+        else:
+            params[key] = str(params.get(key))
+    return params
+
+
+def send_request(url, headers, params):
+    if params is None:
+        return
+    try:
+        result = requests.post(url=url, headers=headers, params=params)
+        result.raise_for_status()
+        logging.info(result.json())
+    except requests.exceptions.RequestException as e:
+        logging.error(f"请求失败: {e}")
+
+
+def get_request_base(project_name, process_name, token):
+    base_config: dict = load_yaml_config(root_path + config_path)
+    base_url = base_config.get("base_url")
+    headers = {
+        "token": token
+    }
+    params = init_params(base_config)
+    process_code_config: dict = load_yaml_config(root_path + process_code_path)
+    project_code = process_code_config.get("project_code").get(project_name)
+    url = f"{base_url}/projects/{project_code}/executors/start-process-instance"
+    process_code = process_code_config.get("process_code").get(project_name).get(process_name)
+    params["project_code"] = str(project_code)
+    params["processDefinitionCode"] = str(process_code)
+    return url, headers, params
+
+
+def start_workflow(project_name, process_name, start_params, token):
+    url, headers, params = get_request_base(project_name, process_name, token)
+    params["startParams"] = json.dumps(start_params)
+    send_request(url, headers, params)
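A hypothetical invocation (the token is a placeholder; project and workflow names come from process_code.yaml above, and the start parameter is illustrative):

    if __name__ == '__main__':
        start_workflow(
            project_name='customs-data-mix',
            process_name='mix_fill_fix',
            start_params={'dt': '20240304'},
            token='<dolphinscheduler-api-token>',
        )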

+ 3 - 0
dw_base/elasticsearch/__init__.py

@@ -0,0 +1,3 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+

+ 3 - 0
dw_base/flink/__init__.py

@@ -0,0 +1,3 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+

+ 3 - 0
dw_base/hive/__init__.py

@@ -0,0 +1,3 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+

+ 30 - 0
dw_base/hive/hive_constants.py

@@ -0,0 +1,30 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+COLUMN_NAME_COMMENT_DICT = {
+    'valid': '是否校验通过, bool类型',
+    'validate_desc': '校验结果详情',
+    'crawler_time': '抓取时间',
+    'create_date': '创建日期',
+    'update_date': '更新日期',
+    'record_status': '记录状态',
+    'pid': 'PID',
+    'company_name': '企业名称',
+    'db_name': '来源站(拼音)',
+    'source_name': '来源站(中文)',
+    'web_name': '来源站',
+    'url': '可用外链链接',
+    'url_desc': '下载数据链接',
+    'json_cache_id': 'json页面OSS id',
+    'html_cache_id': 'HTML页面OSS id',
+    'use_status': '删除标记',
+    'is_final': '能否被爬虫更新',
+    'id': 'pg库自增主键'
+}
+COLUMN_NAME_TYPE_DICT = {
+    'valid': 'BOOLEAN',
+    'crawler_time': 'BIGINT',
+    'create_date': 'BIGINT',
+    'update_date': 'BIGINT',
+    'use_status': 'INT',
+    'id': 'BIGINT'
+}

+ 141 - 0
dw_base/hive/hive_utils.py

@@ -0,0 +1,141 @@
+# -*- coding:utf-8 -*-
+
+from typing import List, Dict
+
+from dw_base.hive.hive_constants import COLUMN_NAME_TYPE_DICT, COLUMN_NAME_COMMENT_DICT
+
+
+def get_hive_database_name(project: str, layer: str, env: str) -> str:
+    """
+    Build the Hive database name.
+    Args:
+        project: project the table belongs to
+        layer: warehouse layer
+        env: environment
+    Returns: Hive database name
+    """
+    if project and project != '':
+        if layer and layer != '':
+            database = f'{project}_{layer}'
+        else:
+            database = f'{project}_ods'
+    elif layer and layer != '':
+        database = layer
+    else:
+        database = 'tmp'
+    if database != 'tmp' and env and env != '':
+        database = f'{database}_{env}'
+    return database
+
+
+def get_hive_table_prefix(project: str, layer: str, version: str) -> str:
+    """
+    Build the table name prefix.
+    Args:
+        project: project the table belongs to
+        layer: warehouse layer
+        version: version
+    Returns: table name prefix
+    """
+    if layer and layer != '':
+        if project and project != '':
+            prefix = f'{layer}_{project}'
+        else:
+            prefix = layer
+        if version and version != '':
+            prefix = f'{prefix}_{version}'
+    elif project and project != '':
+        prefix = project
+    else:
+        prefix = 'tmp'
+    return prefix
+
+
+def get_hive_create_table_ddl(database: str,
+                              table: str,
+                              columns: List[str],
+                              columns_with_types: Dict,
+                              comment: str = '',
+                              is_external: bool = False,
+                              is_partitioned: bool = False):
+    """
+    Generate a Hive CREATE TABLE statement.
+    Args:
+        database: database name
+        table: table name
+        columns: column names
+        columns_with_types: explicit column-to-type mapping; falls back to COLUMN_NAME_TYPE_DICT, then STRING
+        comment: table comment
+        is_external: whether to create an external table
+        is_partitioned: whether to partition the table by dt
+    Returns: the DDL string
+    """
+    ddl = "DROP TABLE IF EXISTS {0};\nCREATE {1}TABLE IF NOT EXISTS {2}\n(\n{3}\n)\n\tCOMMENT '{4}'\n{5}\tSTORED AS ORC\n;"
+    if is_external:
+        argument1 = 'EXTERNAL '
+    else:
+        argument1 = ''
+    if database is None:
+        table_with_database = table
+    else:
+        table_with_database = f'{database}.{table}'
+    max_column_length = max(map(len, columns))
+    column_defs = []
+    for col in columns:
+        padded_col = col.ljust(max_column_length, ' ')
+        if col in columns_with_types:
+            col_type = columns_with_types.get(col)
+        else:
+            col_type = COLUMN_NAME_TYPE_DICT.get(col, 'STRING')
+        col_comment = COLUMN_NAME_COMMENT_DICT.get(col, '')
+        column_defs.append(f"    {padded_col} {col_type} COMMENT '{col_comment}'")
+    argument3 = ',\n'.join(column_defs)
+    if is_partitioned:
+        argument5 = '\tPARTITIONED BY (dt STRING)\n'
+    else:
+        argument5 = ''
+    return ddl.format(table_with_database, argument1, table_with_database, argument3, comment, argument5)
+
+
+def get_hive_create_table_ddl_sop(database: str,
+                                  table: str,
+                                  columns: List[str],
+                                  columns_with_types: Dict,
+                                  comment: str = '',
+                                  is_external: bool = False,
+                                  is_partitioned: bool = False):
+    """
+    Generate a Hive CREATE TABLE statement (same as get_hive_create_table_ddl, but the DDL is indented
+    with four spaces instead of tabs).
+    Args:
+        database: database name
+        table: table name
+        columns: column names
+        columns_with_types: explicit column-to-type mapping; falls back to COLUMN_NAME_TYPE_DICT, then STRING
+        comment: table comment
+        is_external: whether to create an external table
+        is_partitioned: whether to partition the table by dt
+    Returns: the DDL string
+    """
+    ddl = "DROP TABLE IF EXISTS {0};\nCREATE {1}TABLE IF NOT EXISTS {2}\n(\n{3}\n)\n    COMMENT '{4}'\n{5}    STORED AS ORC\n;"
+    if is_external:
+        argument1 = 'EXTERNAL '
+    else:
+        argument1 = ''
+    if database is None:
+        table_with_database = table
+    else:
+        table_with_database = f'{database}.{table}'
+    max_column_length = max(map(len, columns))
+    column_defs = []
+    for col in columns:
+        padded_col = col.ljust(max_column_length, ' ')
+        if col in columns_with_types:
+            col_type = columns_with_types.get(col)
+        else:
+            col_type = COLUMN_NAME_TYPE_DICT.get(col, 'STRING')
+        col_comment = COLUMN_NAME_COMMENT_DICT.get(col, '')
+        column_defs.append(f"    {padded_col} {col_type} COMMENT '{col_comment}'")
+    argument3 = ',\n'.join(column_defs)
+    if is_partitioned:
+        argument5 = '    PARTITIONED BY (dt STRING)\n'
+    else:
+        argument5 = ''
+    return ddl.format(table_with_database, argument1, table_with_database, argument3, comment, argument5)
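A quick illustration of the naming helpers and the DDL generator (project, layer, env and column values are examples, not from the commit):

    from dw_base.hive.hive_utils import (get_hive_create_table_ddl,
                                         get_hive_database_name,
                                         get_hive_table_prefix)

    print(get_hive_database_name('cts', 'dwd', 'prod'))  # cts_dwd_prod
    print(get_hive_table_prefix('cts', 'dwd', 'v2'))     # dwd_cts_v2
    print(get_hive_create_table_ddl(
        database='cts_dwd_prod',
        table='dwd_cts_v2_demo',
        columns=['id', 'company_name', 'crawler_time'],
        columns_with_types={},
        comment='demo table',
        is_partitioned=True,
    ))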

+ 3 - 0
dw_base/ml/__init__.py

@@ -0,0 +1,3 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+

+ 3 - 0
dw_base/oss/__init__.py

@@ -0,0 +1,3 @@
+#!/usr/bin/env /usr/bin/python3
+# -*- coding:utf-8 -*-
+

+ 235 - 0
dw_base/oss/oss2_util.py

@@ -0,0 +1,235 @@
+import os
+
+import oss2
+
+from dw_base.utils.file_utils import get_abs_path
+
+
+class Readable:
+    def read(self):
+        pass
+
+
+class OssClient:
+    """
+    OSS terminology
+    English                  | Meaning
+    Bucket                   | storage space
+    Object                   | object / file
+    Endpoint                 | OSS access domain
+    Region                   | region / data centre
+    AccessKey                | collective name for AccessKeyId and AccessKeySecret (access credentials)
+    Put Object               | simple upload
+    Post Object              | form upload
+    Multipart Upload         | multipart upload
+    Append Object            | append upload
+    Get Object               | simple download
+                             | callback
+    Object Meta              | object metadata describing the file, e.g. length and type
+    Data                     | file data
+    Key                      | object name
+    ACL (Access Control List)| permissions on a bucket or object
+    """
+
+    def __init__(self, access_key_id, access_key_secret, endpoint, bucket_name=None):
+        """
+        Provides helpers for the common OSS operations
+        :param str access_key_id:
+        :param str access_key_secret:
+        :param str endpoint:
+        :rtype: OssClient
+        """
+        self._access_key_id = os.getenv('OSS_TEST_ACCESS_KEY_ID', access_key_id)
+        self._access_key_secret = os.getenv('OSS_TEST_ACCESS_KEY_SECRET', access_key_secret)
+        self._endpoint = os.getenv('OSS_TEST_ENDPOINT', endpoint)
+        self._auth = oss2.Auth(self._access_key_id, self._access_key_secret)
+        self._bucket_name = bucket_name
+        if bucket_name:
+            self._bucket = oss2.Bucket(self._auth, self._endpoint, bucket_name)
+        # create a Service object
+        self._service = oss2.Service(self._auth, self._endpoint)
+
+    def list_buckets(self, prefix='', marker='', max_keys=100, params=None):
+        """
+        List the user's buckets by prefix
+        :param str prefix: only list buckets whose names start with this prefix; an empty string lists all buckets
+        :param str marker: paging marker; pass an empty string on the first call, then the next_marker from the previous result
+        :param int max_keys: maximum number of buckets returned per call
+        :param dict params: extra list parameters; pass 'tag-key'/'tag-value' to filter the result
+        :return: the listing result
+        :rtype: oss2.models.ListBucketsResult
+        """
+        list_buckets_result = self._service.list_buckets(prefix, marker, max_keys, params)
+        return list_buckets_result
+
+    def get_bucket(self, bucket_name=None):
+        """
+        Get a bucket handle
+        :param str bucket_name: bucket name
+        :return:
+        :rtype: oss2.Bucket
+        """
+        assert bucket_name or self._bucket_name, 'need bucket name since default bucket is not provided'
+        if bucket_name:
+            return oss2.Bucket(self._auth, self._endpoint, bucket_name)
+        return self._bucket
+
+    def get_bucket_info(self, bucket_name=None):
+        """
+        Get bucket information such as creation time, access endpoint, owner and ACL.
+        :param str bucket_name: bucket name
+        :return:
+        :rtype: oss2.models.GetBucketInfoResult
+        """
+        return self.get_bucket(bucket_name).get_bucket_info()
+
+    def get_bucket_status(self, bucket_name=None):
+        """
+        Get bucket statistics: current bucket size, object count, number of in-progress multipart uploads, etc.
+        :param str bucket_name: bucket name
+        :return:
+        :rtype: oss2.models.GetBucketStatResult
+        """
+        return self.get_bucket(bucket_name).get_bucket_stat()
+
+    def set_bucket_lifecycle(self, bucket_name=None, lifecycle_rule=None):
+        """
+        Set the bucket lifecycle
+        Example:
+        # lifecycle rule: objects with the '中文/' prefix expire 357 days after their last modification
+        rule = oss2.models.LifecycleRule('lc_for_chinese_prefix', '中文/', status=oss2.models.LifecycleRule.ENABLED,
+                                         expiration=oss2.models.LifecycleExpiration(days=357))
+        # abort multipart uploads 356 days after the last modification
+        rule.abort_multipart_upload = oss2.models.AbortMultipartUpload(days=356)
+        # transition objects to IA 180 days after the last modification
+        rule.storage_transitions = [oss2.models.StorageTransition(days=180, storage_class=oss2.BUCKET_STORAGE_CLASS_IA)]
+        # transition objects to ARCHIVE 356 days after the last modification
+        rule.storage_transitions.append(oss2.models.StorageTransition(days=356,
+                                                                      storage_class=oss2.BUCKET_STORAGE_CLASS_ARCHIVE))
+        lifecycle = oss2.models.BucketLifecycle([rule])
+        :param str bucket_name: bucket name
+        :param oss2.models.BucketLifecycle lifecycle_rule: lifecycle configuration
+        :return:
+        :rtype: oss2.models.RequestResult
+        """
+        assert lifecycle_rule, 'a BucketLifecycle must be provided'
+        return self.get_bucket(bucket_name).put_bucket_lifecycle(lifecycle_rule)
+
+    def upload_object(self, object_name, data, bucket_name=None, headers=None, progress_callback=None):
+        """
+        Upload an object
+        :param str bucket_name: bucket name
+        :param str object_name: name of the object to upload
+        :param bytes or str or Readable data: data to upload (a byte array, a string, or a file-like object with a read method)
+        :param dict[str,any] or oss2.CaseInsensitiveDict headers:
+        :param function progress_callback: progress callback
+        :return:
+        :rtype: oss2.models.PutObjectResult
+        """
+        bucket = self.get_bucket(bucket_name)
+        put_object_result = bucket.put_object(object_name, data, headers, progress_callback)
+        return put_object_result
+
+    def upload_object_from_file(self, filename, object_name=None, bucket_name=None, headers=None,
+                                progress_callback=None):
+        """
+        Upload a local file
+        Examples:
+        1.  upload('my-bucket','my-file1.txt','content of my-file1')
+        2.  upload('my-bucket','my-file2.txt',b'content of my-file2')
+        3.  with open(oss2.to_unicode('my-file3.txt'), 'rb') as f:
+                upload('my-file3.txt', f)
+        :param str bucket_name: bucket name
+        :param str filename: local file to upload
+        :param str object_name: object name to use in OSS after upload
+        :param dict[str,any] or oss2.CaseInsensitiveDict headers:
+        :param function progress_callback: progress callback
+        :return:
+        :rtype: oss2.models.PutObjectResult
+        """
+        if not object_name:
+            object_name = os.path.basename(filename)
+        bucket = self.get_bucket(bucket_name)
+        put_object_result = bucket.put_object_from_file(object_name, filename, headers, progress_callback)
+        return put_object_result
+
+    def download_object(self, bucket_object_name, local_object_name=None, bucket_name=None):
+        """
+        Download an object to a local file
+        :param str bucket_name: bucket name
+        :param str bucket_object_name: object name in the bucket
+        :param str local_object_name: local file name to save as
+        :return:
+        :rtype: oss2.models.GetObjectResult
+        """
+        bucket = self.get_bucket(bucket_name)
+        if local_object_name and bucket_object_name != local_object_name:
+            # download and save under a different name
+            get_object_result = bucket.get_object_to_file(bucket_object_name, local_object_name)
+        else:
+            get_object_result = bucket.get_object_to_file(bucket_object_name, bucket_object_name)
+        return get_object_result
+
+    def delete_object(self, object_name, bucket_name=None):
+        """
+        Delete a single object
+        :param str bucket_name: bucket name
+        :param str object_name: object name
+        :return:
+        :rtype: oss2.models.RequestResult
+        """
+        bucket = self.get_bucket(bucket_name)
+        return bucket.delete_object(object_name)
+
+    def delete_objects(self, objects_name, bucket_name=None):
+        """
+        Delete objects in batch
+        :param str bucket_name: bucket name
+        :param list[str] objects_name: list of object names
+        :return:
+        :rtype: oss2.models.BatchDeleteObjectsResult
+        """
+        bucket = self.get_bucket(bucket_name)
+        return bucket.batch_delete_objects(objects_name)
+
+    def get_object(self, object_name, bucket_name=None):
+        """
+        Get an object
+        :param str object_name: object name
+        :param str bucket_name: bucket name
+        :return:
+        :rtype: oss2.models.GetObjectResult
+        """
+        bucket = self.get_bucket(bucket_name)
+        return bucket.get_object(object_name)
+
+    def get_object_meta(self, object_name, bucket_name=None):
+        """
+        Get detailed information about an object
+        :param str object_name: object name
+        :param str bucket_name: bucket name
+        :return:
+        :rtype: oss2.models.GetObjectMetaResult
+        """
+        bucket = self.get_bucket(bucket_name)
+        return bucket.get_object_meta(object_name)
+
+    def list_objects(self, bucket_name=None):
+        bucket = self.get_bucket(bucket_name)
+        return bucket.list_objects()
+
+
+DEFAULT_ACCESS_KEY_ID = 'LTAI5t9oGbXWacakS4PJyQsR'
+DEFAULT_ACCESS_KEY_SECRET = 'BeuOP5zsavBtsR8fQ5QmrNyczdCW1Q'
+DEFAULT_ENDPOINT = 'oss-cn-qingdao-internal.aliyuncs.com'
+DEFAULT_BUCKET_NAME = 'skb-applogo'
+DEFAULT_DOWNLOAD_ADDRESS = f'https://skb-applogo.oss-cn-qingdao.aliyuncs.com'
+DEFAULT_OSS_CLIENT = OssClient(
+    DEFAULT_ACCESS_KEY_ID,
+    DEFAULT_ACCESS_KEY_SECRET,
+    DEFAULT_ENDPOINT,
+    DEFAULT_BUCKET_NAME
+)
+
+if __name__ == '__main__':
+    DEFAULT_OSS_CLIENT.upload_object_from_file(get_abs_path('lib/gzrt-0.8.tar.gz'))
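A minimal usage sketch (endpoint, bucket and credentials are placeholders; it needs network access to a real bucket):

    client = OssClient('<access-key-id>', '<access-key-secret>',
                       'oss-cn-qingdao.aliyuncs.com', 'my-bucket')
    client.upload_object('logs/app.log', b'hello oss')
    print(client.get_object('logs/app.log').read())  # b'hello oss'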

+ 0 - 0
dw_base/scheduler/__init__.py


+ 186 - 0
dw_base/scheduler/country_count_dingtalk.py

@@ -0,0 +1,186 @@
+# Data volume metrics
+# Example arguments: -mgdb kazakhstan -dt 20240304
+import sys
+import re
+import os
+import requests
+import json
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+
+from dw_base.spark.spark_sql import SparkSQL
+from dw_base.utils.config_utils import parse_args
+
+
+def send_dingtalk_notification(msg):
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "text",
+        "text": {"content": msg}
+    }
+    json_data = json.dumps(data)
+    url = f'http://m1.node.cdh/dingtalk/api/robot/send?access_token=72cbdfb0a30fa51defca1dcba1c7b68feaace79c08e69da8cf9a7ea321481b06'
+    # the URL below is for testing
+    # url = f'http://m1.node.cdh/dingtalk/api/robot/send?access_token=89974c66ec5a33c67acd71c0544fe323dd76c5d7a6f0b92acd09175745b737a0'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+
+def main():
+    # parse command-line arguments
+    CONFIG, _ = parse_args(sys.argv[1:])
+    mgdb = CONFIG.get('mgdb')
+    dt = CONFIG.get('dt')
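+    # note: dt is parsed here, but the partition filters in the queries below are hard-coded to '19700101'/'20240303'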
+
+    with SparkSQL() as spark:
+        country_im_colm = {
+            'russia': 'shrmc',
+            'india': 'jksmc',
+            'india_exp': 'jksmc',
+            'vietnam': 'jksmc',
+            'turkey': 'jksmc',
+            'kazakhstan': 'jksmc',
+            'mexico': 'jksmc',
+            'mexico_bol': 'jksmc'
+        }
+        country_ex_colm = {
+            'russia': 'fhrmc',
+            'india': 'cksmc',
+            'india_exp': 'cksmc',
+            'vietnam': 'cksmc',
+            'turkey': 'cksmc',
+            'kazakhstan': 'cksmc',
+            'mexico': 'cksmc',
+            'mexico_bol': 'cksmc'
+        }
+
+        sql_query1 = (f"select count(1) AS total_tid_count from ( select id "
+                      f"from (select jkstid as id "
+                      f"      from dwd.cts_{mgdb}_im "
+                      f"      where dt in ('19700101', '20240303') "
+                      f"      union all "
+                      f"      select ckstid as id "
+                      f"      from dwd.cts_{mgdb}_ex "
+                      f"      where dt in ('19700101', '20240303')) a "
+                      f"group by id)b ")
+        res = spark.query(sql_query1)[0].collect()
+        cnt1 = res[0]['total_tid_count']
+
+        sql_query2 = (f"select count(1) AS total_tid_count "
+                      f"from ( "
+                      f"select id "
+                      f"from (select id, count(1) "
+                      f"      from (select jkstid as id, {country_im_colm[mgdb]} as mc  "
+                      f"            from dwd.cts_{mgdb}_im "
+                      f"            where dt in ('19700101', '20240303') "
+                      f"            union all "
+                      f"            select ckstid as id, {country_ex_colm[mgdb]} as mc "
+                      f"            from dwd.cts_{mgdb}_ex "
+                      f"            where dt in ('19700101', '20240303')) a "
+                      f"      group by id, mc) b "
+                      f"group by id "
+                      f"having count(1) = 1)b ")
+        res = spark.query(sql_query2)[0].collect()
+        cnt2 = res[0]['total_tid_count']
+
+        sql_query3 = (f"select count(1) AS total_tid_count "
+                      f"from ( "
+                      f"select id "
+                      f"from (select id "
+                      f"      from (select jkstid as id, {country_im_colm[mgdb]} as mc "
+                      f"            from dwd.cts_{mgdb}_im "
+                      f"            where dt in ('19700101', '20240303') "
+                      f"            union all "
+                      f"            select ckstid as id, {country_ex_colm[mgdb]} as mc "
+                      f"            from dwd.cts_{mgdb}_ex "
+                      f"            where dt in ('19700101', '20240303')) a "
+                      f"      group by id, mc) b "
+                      f"group by id "
+                      f"having count(1) > 1)b ")
+        res = spark.query(sql_query3)[0].collect()
+        cnt3 = res[0]['total_tid_count']
+
+        sql_query4 = (f"select (select count(1) "
+                      f"        from dwd.cts_{mgdb}_im "
+                      f"        where dt in ('19700101', '20240303')) + "
+                      f"       (select count(1) "
+                      f"        from dwd.cts_{mgdb}_ex "
+                      f"        where dt in ('19700101', '20240303')) as total_tid_count ")
+        res = spark.query(sql_query4)[0].collect()
+        cnt4 = res[0]['total_tid_count']
+        sql_query5 = (f"select count(1) AS total_tid_count  "
+                      f"from (select jkstid as id "
+                      f"            from dwd.cts_{mgdb}_im "
+                      f"            where dt in ('19700101', '20240303') "
+                      f"            union all "
+                      f"            select ckstid as id "
+                      f"            from dwd.cts_{mgdb}_ex "
+                      f"            where dt in ('19700101', '20240303'))c "
+                      f"where id in (select id "
+                      f"                 from (select id "
+                      f"                       from (select jkstid as id, {country_im_colm[mgdb]} as mc "
+                      f"                             from dwd.cts_{mgdb}_im "
+                      f"                             where dt in ('19700101', '20240303') "
+                      f"                             union all "
+                      f"                             select ckstid as id, {country_ex_colm[mgdb]} as mc "
+                      f"                             from dwd.cts_{mgdb}_ex "
+                      f"                             where dt in ('19700101', '20240303')) a "
+                      f"                       group by id, mc) b "
+                      f"                 group by id "
+                      f"                 having count(1) = 1) ")
+        res = spark.query(sql_query5)[0].collect()
+        cnt5 = res[0]['total_tid_count']
+
+        sql_query6 = (f"select count(1) AS total_tid_count "
+                      f"from (select jkstid as id "
+                      f"            from dwd.cts_{mgdb}_im "
+                      f"            where dt in ('19700101', '20240303') "
+                      f"            union all "
+                      f"            select ckstid as id "
+                      f"            from dwd.cts_{mgdb}_ex "
+                      f"            where dt in ('19700101', '20240303'))c "
+                      f"where id in (select id "
+                      f"                 from (select id "
+                      f"                       from (select jkstid as id, {country_im_colm[mgdb]} as mc "
+                      f"                             from dwd.cts_{mgdb}_im "
+                      f"                             where dt in ('19700101', '20240303') "
+                      f"                             union all "
+                      f"                             select ckstid as id, {country_ex_colm[mgdb]} as mc "
+                      f"                             from dwd.cts_{mgdb}_ex "
+                      f"                             where dt in ('19700101', '20240303')) a "
+                      f"                       group by id, mc) b "
+                      f"                 group by id "
+                      f"                 having count(1) > 1) ")
+        res = spark.query(sql_query6)[0].collect()
+        cnt6 = res[0]['total_tid_count']
+        sql_query7 = (f"select count(1) AS total_tid_count "
+                      f"from (select jkstid as id "
+                      f"            from dwd.cts_{mgdb}_im "
+                      f"            where dt in ('19700101', '20240303') "
+                      f"            union all "
+                      f"            select ckstid as id "
+                      f"            from dwd.cts_{mgdb}_ex "
+                      f"            where dt in ('19700101', '20240303'))c "
+                      f"where id is null ")
+        res = spark.query(sql_query7)[0].collect()
+        cnt7 = res[0]['total_tid_count']
+
+        msg = (f"{mgdb}数据量指标 \n"
+               f"-----------------------------------\n"
+               f"{mgdb}进出口统计:\n\n"
+               f"总tid数量:\t\t\t{cnt1}\n"
+               f"一对一的tid数量:\t\t{cnt2}\n"
+               f"一对多的tid数量:\t\t{cnt3}\n\n"
+               f"详单总数据量:\t\t{cnt4}\n"
+               f"一对一的tid的详单数量:\t{cnt5}\n"
+               f"一对多的tid的详单数量:\t{cnt6}\n"
+               f"tid为空的详单数量:\t\t{cnt7}\n"
+               f"  \n"
+               )
+        send_dingtalk_notification(msg)
+
+
+if __name__ == '__main__':
+    main()

+ 240 - 0
dw_base/scheduler/dingtalk_mirror_monitor.py

@@ -0,0 +1,240 @@
+# DingTalk monitoring of whether the T+1 job needs to be rerun
+import sys
+import re
+import os
+import requests
+import json
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.spark.spark_sql import SparkSQL
+from dw_base.utils.log_utils import pretty_print
+from configparser import ConfigParser
+import time
+from pymongo import MongoClient
+from dw_base import *
+from dw_base.scheduler.polling_scheduler import get_mongo_client
+from dw_base.utils.config_utils import parse_args
+from dw_base.scheduler.mg2es.conf_reader import ConfReader
+from dw_base.scheduler.mg2es.es_operator import ESOperator
+from elasticsearch.exceptions import NotFoundError
+
+call_count = 0
+
+
+def check_call_count():
+    global call_count
+    if call_count == 0:
+        pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                     f'{NORM_MGT}向后传递参数: {NORM_GRN}is_run => 1 '
+                     f'{NORM_MGT} call_count =>{call_count}')
+        print('${setValue(is_run=%s)}' % '1')
+    else:
+        pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                     f'{NORM_MGT}向后传递参数: {NORM_GRN}is_run => 0 '
+                     f'{NORM_MGT} call_count =>{call_count}')
+        print('${setValue(is_run=%s)}' % '0')
+
+
+def send_dingtalk_notification(msg):
+    global call_count
+    call_count += 1
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "text",
+        "text": {"content": msg}
+    }
+    json_data = json.dumps(data)
+    # the URL below is for testing
+    url = 'http://m1.node.cdh/dingtalk/api/robot/send?access_token=a4a48ed82627149f3317ee86e249fd7d973f5bed40fcac55cc2e7ca8d9ae0c61'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+
+def send_dingtalk_notification_es(msg):
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "text",
+        "text": {"content": msg}
+    }
+    json_data = json.dumps(data)
+    # the URL below is for testing
+    url = 'http://m1.node.cdh/dingtalk/api/robot/send?access_token=a4a48ed82627149f3317ee86e249fd7d973f5bed40fcac55cc2e7ca8d9ae0c61'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+
+def get_mongo_client(conf_path):
+    config_parser = ConfigParser()
+    config_parser.read(root_path + conf_path)
+    url = config_parser.get('base', 'address')
+    return MongoClient(url)
+
+
+def get_count(client, mgdb, mgtbl):
+    db = client[mgdb]
+    collection = db[mgtbl]
+    return collection.count()
+
+
+def get_count_null(client, mgdb, mgtbl):
+    db = client[mgdb]
+    collection = db[mgtbl]
+    # count documents whose `date` field is not null:
+    # return collection.count_documents({'date': {'$ne': None}})
+    # count documents whose `date` field is null
+    return collection.count_documents({'date': None})
+
+
+def get_old_count(mgdb, mgtbl):
+    client = get_mongo_client('/../datasource/mongo/mongo-cts-prod-old.ini')
+    result = get_count(client, mgdb, mgtbl)
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb}.{mgtbl} '
+                 f'{NORM_MGT} old data count: {NORM_GRN}{result}')
+    return result
+
+
+def get_clu_count_null(mgdb, mgtbl):
+    client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
+    result = get_count_null(client, mgdb, mgtbl)
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} 集群 mongo: {NORM_GRN}{mgdb}.{mgtbl} '
+                 f'{NORM_MGT} 集群date字段为空 count: {NORM_GRN}{result}')
+    return result
+
+
+def get_dev_count_null(mgdb, mgtbl):
+    client = get_mongo_client('/../datasource/mongo/mongo-cts-dev-rw-200-test.ini')
+    result = get_count_null(client, mgdb, mgtbl)
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} dev source mongo: {NORM_GRN}{mgdb}.{mgtbl} '
+                 f'{NORM_MGT} dev data count: {NORM_GRN}{result}')
+    return result
+
+
+def get_clu_count(mgdb, mgtbl):
+    client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
+    result = get_count(client, mgdb, mgtbl)
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} 大数据集群mongo sink mongo: {NORM_GRN}{mgdb}.{mgtbl} '
+                 f'{NORM_MGT} 大数据集群mongo data count: {NORM_GRN}{result}')
+    return result
+
+
+
+
+def get_diff_logic(spark,record,dt):
+    mgdb = record['mgdb']
+    catalog = record['catalog']
+    bigdata_count = record['cnt']
+    clu_cnt = get_clu_count(mgdb, catalog)
+
+    date_null_cnt = get_clu_count_null(mgdb, catalog)
+
+    # compare record counts between the two mongo instances
+    cnt_diff = clu_cnt - bigdata_count
+
+    # if cnt_diff != 0 or date_null_cnt != 0:
+    if date_null_cnt != 0:
+        msg3 = (
+            f"\n"
+            f"--------------------------------\n"
+            f"镜像_mir 数据一致性警告\n"
+            f"--------------------------------\n"
+            f"在 {mgdb}_{catalog}  详细差异报告:\n\n"
+            f"\n"
+            f"--------------------------------\n"
+            f"计数对比:\n"
+            f"  大数据_镜像mongo 计数: {clu_cnt}\n"
+            f"  大数据平台 DWD 计数: {bigdata_count}\n"
+            f"  大数据_镜像mongo `date`字段为空 计数: {date_null_cnt}\n"
+            f"\n"
+            f"请检查原因 \n"
+            f"\n"
+            f"--------------------------------\n"
+        )
+        print(msg3)
+        # send_dingtalk_notification(msg3)
+
+    # record the final counts for each database/catalog
+    statistical_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    sql_insert_cnt = f"""
+
+    insert into table task.cts_mirror_count 
+    select '{mgdb}','{catalog}',{bigdata_count},{clu_cnt},'{statistical_time}','{dt}'
+
+    """
+    spark.query(sql_insert_cnt)[0].collect()
+
+
+def main():
+    CONFIG, _ = parse_args(sys.argv[1:])
+    dt = CONFIG.get('dt')
+    ydt = CONFIG.get('ydt')
+    spark = SparkSQL()
+    spark._final_spark_config = {'hive.exec.dynamic.partition': 'true',
+                                 'hive.exec.dynamic.partition.mode': 'nonstrict',
+                                 'spark.yarn.queue': 'cts',
+                                 'spark.sql.crossJoin.enabled': 'true',
+                                 'spark.executor.memory': '8g',
+                                 'spark.executor.memoryOverhead': '2048',
+                                 'spark.driver.memory': '4g',
+                                 'spark.executor.instances': "12",
+                                 'spark.executor.cores': '4',
+                                 "spark.sql.hive.filesourcePartitionFileCacheSize":"536870912"
+                                 }
+    im_sql = (
+        f"select i.code3 as code3,code.english_name as country_name,concat(code.english_name,'_mir') as mgdb,cnt,'shipments_imports' as catalog"
+        f"  from"
+        f"( select   country_code as code3 ,count(1) as cnt from (select country_code from dwd.cts_mirror_country_im  where dt ='{ydt}') im "
+        f"group by country_code) i left join dim.cts_mirror_monitor code"
+        f" on i.code3 = code.code3 where code.english_name is not null")
+    ex_sql = (
+        f"select i.code3 as code3,code.english_name as country_name,concat(code.english_name,'_mir') as mgdb,cnt,'shipments_exports' as catalog "
+        f" from"
+        f"( select   country_code as code3 ,count(1) as cnt from (select country_code from dwd.cts_mirror_country_ex  where dt ='{ydt}') ex "
+        f"group by country_code) i left join dim.cts_mirror_monitor code"
+        f" on i.code3 = code.code3 where code.english_name is not null")
+
+    res_im = spark.query(im_sql)[0].collect()
+    res_ex = spark.query(ex_sql)[0].collect()
+
+
+    for record in res_im:
+        get_diff_logic(spark, record,dt)
+    for record in res_ex:
+        get_diff_logic(spark, record,dt)
+
+    sql_overwrite_cnt = f"""
+
+INSERT overwrite TABLE task.cts_mirror_count
+SELECT country,
+       catalog,
+       dwd_cnt,
+       mongo_cnt,
+       creat_time,
+       dt
+FROM
+  ( SELECT *,
+           row_number() over (partition BY country,catalog
+                              ORDER BY `creat_time` DESC) AS rk
+  FROM task.cts_mirror_count
+  WHERE dt ={dt}   ) tmp 
+where rk =1
+           """
+    spark.query(sql_overwrite_cnt)[0].collect()
+    check_call_count()
+
+
+if __name__ == '__main__':
+    main()
+
+# CREATE TABLE task.cts_mirror_count
+# (
+#     `country`    string COMMENT 'mgdb',
+#     `catalog`    string COMMENT '进出口类型',
+#     `cnt`        bigint comment '数据量',
+#     `creat_time` STRING COMMENT '统计时间'
+# )
+#     PARTITIONED BY ( `dt` string )
+#     TBLPROPERTIES ( 'COMMENT' = '同步到大数据平台的数据量统计');
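The `${setValue(is_run=...)}` lines printed by check_call_count above rely on DolphinScheduler reading output parameters from task stdout so that a downstream task can branch on is_run; a minimal sketch of the same pattern, assuming the script runs as a DolphinScheduler task:

    # emit an output parameter for downstream tasks; DolphinScheduler picks it up from stdout
    is_run = '1' if call_count == 0 else '0'
    print('${setValue(is_run=%s)}' % is_run)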

+ 102 - 0
dw_base/scheduler/dingtalk_notifier.py

@@ -0,0 +1,102 @@
+# Call the DingTalk robot to notify the people responsible for refreshing ES
+
+import sys
+import re
+import os
+
+
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.utils.log_utils import pretty_print
+from dw_base import *
+import requests
+import json
+import time
+from dw_base.scheduler.polling_scheduler import get_sink_count
+from dw_base.utils.config_utils import parse_args
+from dw_base.spark.spark_sql import SparkSQL
+import random
+
+
+def send_dingtalk_notification(msg):
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "text",
+        "text": {"content": msg},
+        "at": {"atMobiles": ["13924570409"]}
+    }
+    json_data = json.dumps(data)
+    url = 'https://oapi.dingtalk.com/robot/send?access_token=bda512e1f980c8d126361afbae9d744e9885705ce6ed047395a1f6bc4114114d'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+
+
+if __name__ == '__main__':
+    CONFIG, _ = parse_args(sys.argv[1:])
+    start_date = CONFIG.get('start-date')
+    stop_date = CONFIG.get('stop-date')
+    mgdb = CONFIG.get('mgdb')
+    mgtbl = CONFIG.get('mgtbl')
+    batch_id = CONFIG.get('batch_id')
+    cdt =f"{time.strftime('%Y%m%d', time.localtime())}"
+    count = get_sink_count(mgdb, mgtbl, start_date, stop_date)
+
+    spark = SparkSQL()
+    spark._final_spark_config = {'hive.exec.dynamic.partition': 'true',
+                                 'hive.exec.dynamic.partition.mode': 'nonstrict',
+                                 'spark.yarn.queue': 'cts',
+                                 'spark.sql.crossJoin.enabled': 'true',
+                                 'spark.executor.memory': '6g',
+                                 'spark.executor.memoryOverhead': '2048',
+                                 'spark.driver.memory': '4g',
+                                 'spark.executor.instances': "15",
+                                 'spark.executor.cores': '2'
+                                 }
+    if count > 0:
+        try:
+            # list of candidate delay times (seconds)
+            delay_times = [0,10,18,26,39,45,52,60,70,80,90,100]
+            delay = random.choice(delay_times)
+            print(f"随机延时时间为:{delay}秒")
+            time.sleep(delay)
+
+            sql = (f"select count(1) as cnt from task.cts_incr_updated_data_cnt  "
+                   f"where  dt = '{cdt}'")
+            res = spark.query(sql)[0].collect()
+            order_id= int(res[0].cnt +1)
+
+            sql_insert_cnt = f"""
+
+            insert into table task.cts_incr_updated_data_cnt 
+            select '{mgdb}','{mgtbl}',{count},'{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}','{cdt}'
+
+            """
+            spark.query(sql_insert_cnt)[0].collect()
+            msg = (f"数据上新提醒 @13924570409\n"
+                   f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}   ({order_id})\n"
+                   f"{mgdb}.{mgtbl} 今日新增数据量: {count} 已入库完毕,\n调用接口成功,正在刷es索引!"
+                   f"本批数据batch_id为: {batch_id} "
+                   )
+            pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} ({order_id})'
+                         f'{NORM_MGT}已发送通知: {NORM_GRN} {msg} ')
+            send_dingtalk_notification(msg)
+
+        except Exception as e:
+            print(f"发生错误: {e}")
+
+
+#
+#
+# CREATE TABLE task.`cts_incr_updated_data_cnt`
+# (
+#     `mgdb`         STRING COMMENT 'mgdb',
+#     `mgtbl`        STRING COMMENT 'mgtbl',
+#     `count`        int COMMENT '计数',
+#     `created_time` STRING COMMENT '统计时间'
+# )
+#     COMMENT 'cts_incr_updated_data_cnt'
+#     PARTITIONED BY (`dt` STRING)
+#     STORED AS ORC
+#     tblproperties ('orc.compress' = 'ZLIB')
+

+ 370 - 0
dw_base/scheduler/dingtalk_task_monitor.py

@@ -0,0 +1,370 @@
+# DingTalk monitoring of whether the T+1 job needs to be rerun
+
+import sys
+import re
+import os
+import requests
+import json
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.spark.spark_sql import SparkSQL
+from dw_base.utils.log_utils import pretty_print
+from configparser import ConfigParser
+import time
+from datetime import datetime
+from pymongo import MongoClient
+from dw_base import *
+from dw_base.scheduler.polling_scheduler import get_mongo_client
+from dw_base.utils.config_utils import parse_args
+from dw_base.scheduler.mg2es.conf_reader import ConfReader
+from dw_base.scheduler.mg2es.es_operator import ESOperator
+from elasticsearch.exceptions import NotFoundError
+
+call_count = 0
+
+
+def check_call_count():
+    global call_count
+    if call_count == 0:
+        pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                     f'{NORM_MGT}向后传递参数: {NORM_GRN}is_run => 1 '
+                     f'{NORM_MGT} call_count =>{call_count}')
+        print('${setValue(is_run=%s)}' % '1')
+    else:
+        pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                     f'{NORM_MGT}向后传递参数: {NORM_GRN}is_run => 0 '
+                     f'{NORM_MGT} call_count =>{call_count}')
+        print('${setValue(is_run=%s)}' % '0')
+
+
+def send_dingtalk_notification(msg):
+    global call_count
+    call_count += 1
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "text",
+        "text": {"content": msg}
+    }
+    json_data = json.dumps(data)
+    # the URL below is for testing
+    url = 'http://m1.node.cdh/dingtalk/api/robot/send?access_token=a4a48ed82627149f3317ee86e249fd7d973f5bed40fcac55cc2e7ca8d9ae0c61'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+def send_dingtalk_notification_es(msg):
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "text",
+        "text": {"content": msg}
+    }
+    json_data = json.dumps(data)
+    # the URL below is for testing
+    url = 'http://m1.node.cdh/dingtalk/api/robot/send?access_token=a4a48ed82627149f3317ee86e249fd7d973f5bed40fcac55cc2e7ca8d9ae0c61'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+
+def get_mongo_client(conf_path):
+    config_parser = ConfigParser()
+    config_parser.read(root_path + conf_path)
+    url = config_parser.get('base', 'address')
+    return MongoClient(url)
+
+
+def get_count(client, mgdb, mgtbl):
+    db = client[mgdb]
+    collection = db[mgtbl]
+    return collection.count()
+
+
+def get_count_null(client, mgdb, mgtbl):
+    db = client[mgdb]
+    collection = db[mgtbl]
+    # count documents whose `date` field is not null:
+    # return collection.count_documents({'date': {'$ne': None}})
+    # count documents whose `date` field is null
+    return collection.count_documents({'date': None})
+
+def get_count_range_date(mgdb, mgtbl, target_date):
+    """
+    Count documents whose `date` field is earlier than the target date.
+    Args:
+        mgdb: database name
+        mgtbl: collection name
+        target_date: target date string (format: "YYYYMMDD")
+
+    Returns:
+        int: number of matching documents
+    """
+    client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
+    db = client[mgdb]
+    collection = db[mgtbl]
+
+    # convert the input string to a datetime object
+    target_date = datetime.strptime(target_date, "%Y%m%d").replace(
+        tzinfo=None  # this line can be removed if the stored dates carry no timezone
+    )
+
+    count = collection.count_documents({'date': {'$lt': target_date}})
+    return count
+
+
+def get_old_count(mgdb, mgtbl):
+    client = get_mongo_client('/../datasource/mongo/mongo-cts-prod-old.ini')
+    result = get_count(client, mgdb, mgtbl)
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb}.{mgtbl} '
+                 f'{NORM_MGT} old data count: {NORM_GRN}{result}')
+    return result
+def get_clu_count_null(mgdb, mgtbl):
+    client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
+    result = get_count_null(client, mgdb, mgtbl)
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} cluster sink mongo: {NORM_GRN}{mgdb}.{mgtbl} '
+                 f'{NORM_MGT} cluster `date`-is-null count: {NORM_GRN}{result}')
+    return result
+def get_dev_count_null(mgdb, mgtbl):
+    client = get_mongo_client('/../datasource/mongo/mongo-cts-dev-rw-200-test.ini')
+    result = get_count_null(client, mgdb, mgtbl)
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} dev source mongo: {NORM_GRN}{mgdb}.{mgtbl} '
+                 f'{NORM_MGT} dev data count: {NORM_GRN}{result}')
+    return result
+
+
+def get_clu_count(mgdb, mgtbl):
+    client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
+    result = get_count(client, mgdb, mgtbl)
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} 大数据集群mongo sink mongo: {NORM_GRN}{mgdb}.{mgtbl} '
+                 f'{NORM_MGT} 大数据集群mongo data count: {NORM_GRN}{result}')
+    return result
+
+
+def get_bigdata_count(mgdb, mgtbl, dt, spark,cdt):
+    sql = (f"select count(1) cnt "
+           f"from dwd.cts_{mgdb}_{mgtbl} "
+           f" where dt in ('19700101', {dt},{cdt}) ")
+    res = spark.query(sql)[0].collect()
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} 大数据dwd表名: {NORM_GRN}dwd.cts_{mgdb}_{mgtbl} '
+                 f'{NORM_MGT} 大数据dwd 1970+昨日分区+当日分区 count: {NORM_GRN}{res[0].cnt}')
+    return res[0].cnt
+
+
+def get_bigdata_global_bol_count(catalog, dt, spark):
+    sql = (f"""
+    select sum(cnt) cnt from (select count(1) cnt from dwd.`cts_north_america_bol_{catalog}`   where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_central_america_bol_{catalog}`  where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_south_america_bol_{catalog}`    where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_asia_bol_{catalog}`             where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_middle_east_bol_{catalog}`      where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_europe_bol_{catalog}`           where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_africa_bol_{catalog}`           where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_oceania_bol_{catalog}`          where dt in ('19700101', {dt})                 ) a""")
+    res = spark.query(sql)[0].collect()
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} 大数据dwd表名: global_bol 1拆8 '
+                 f'{NORM_MGT} 大数据dwd 1970+昨日分区count: {NORM_GRN}{res[0].cnt}')
+    return res[0].cnt
+
+
+def get_year_count(mgdb, catalog, dt, spark):
+    if mgdb != "global_bol":
+        sql = (f"select from_unixtime(cast(`date`/1000 as int)- 8 * 60 * 60, 'yyyy') as year,count(1) hive_cnt "
+               f"from dwd.cts_{mgdb}_{catalog} "
+               f" where dt in ('19700101', {dt}) "
+               f" group by from_unixtime(cast(`date`/1000 as int)- 8 * 60 * 60, 'yyyy')"
+               f" order by from_unixtime(cast(`date`/1000 as int)- 8 * 60 * 60, 'yyyy')")
+        res = spark.query(sql)[0].collect()
+        hive_year_cnt_dict = {}
+        es_year_cnt_dict = {}
+
+        host, port = ConfReader().get_es_conf()
+        es_operator = ESOperator(host, port)
+        for record in res:
+            year = record['year']
+            hive_cnt = record['hive_cnt']
+            hive_year_cnt_dict[year] = hive_cnt
+            # index_name = 'customs_' + str(catalogs[catalog]) + '_' + mgdb + '-' + year
+            index_name = str(catalog) + '_' + mgdb + '-' + year
+            try:
+                ES_year_cnt = es_operator.get_index_document_count(index_name)
+            except NotFoundError:
+                # not sent via the regular DingTalk alert because of the DingTalk keyword filter
+                msg7 = (f"ES Index {index_name} not found.\n"
+                        f" 请检查原因\n"
+                        )
+                # print(msg7)
+
+                send_dingtalk_notification_es(msg7)
+                ES_year_cnt = 0
+            if ES_year_cnt is None:
+                ES_year_cnt = 0
+            es_year_cnt_dict[year] = ES_year_cnt
+            es_diff = ES_year_cnt - hive_cnt
+            if es_diff != 0:
+                msg5 = (
+                    f"-----------------------------\n"
+                    f"\n"
+                    f"{mgdb}_{catalog} - 数据一致性警告:ES{year}与大数据DWD的{year}数量不一致。\n\n"
+                    f"详细差异报告:\n"
+                    f"-----------------------------------------------------------------------\n"
+                    f"年份:{year}\n"
+                    f"ES{year} 计数:{ES_year_cnt}\n"
+                    f"大数据{year} 计数:{hive_cnt}\n"
+                    f"差异值:{es_diff}\n"
+                    f"-----------------------------------------------------------------------\n"
+                    f"\n"
+                    f"请检查原因 \n"
+                    f"\n"
+                    f"-----------------------------\n"
+                )
+                # print(msg5)
+                send_dingtalk_notification_es(msg5)
+        pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                     f'{NORM_MGT} 大数据dwd表名: {NORM_GRN}dwd.cts_{mgdb}_{catalog} '
+                     f'{NORM_MGT} 大数据hive_year_cnt_dict  {NORM_GRN}{hive_year_cnt_dict}'
+                     f'{NORM_MGT} es_year_cnt_dict  {NORM_GRN}{es_year_cnt_dict}'
+                     )
+
+
+def main():
+    CONFIG, _ = parse_args(sys.argv[1:])
+    dt = CONFIG.get('dt')
+    cdt = CONFIG.get('cdt')
+    spark = SparkSQL()
+    spark._final_spark_config = {'hive.exec.dynamic.partition': 'true',
+                                 'hive.exec.dynamic.partition.mode': 'nonstrict',
+                                 'spark.yarn.queue': 'cts',
+                                 'spark.sql.crossJoin.enabled': 'true',
+                                 'spark.executor.memory': '6g',
+                                 'spark.executor.memoryOverhead': '2048',
+                                 'spark.driver.memory': '4g',
+                                 'spark.executor.instances': "15",
+                                 'spark.executor.cores': '2'
+                                 }
+    sql = (f"select mgdb, catalog from task.mg_count_monitor "
+           f"where is_deleted = '0'")
+    res = spark.query(sql)[0].collect()
+    mgdbs_prod = {
+        'dwd表名': '大数据mongo库名',
+        'un_global_trade_tatistics': 'united_nations_stat',
+        "global_bol": "global_bol"
+    }
+    mgdbs_old = {
+        'dwd表名': 'old_mongo库名',
+        'un_global_trade_tatistics': 'united_nations_stat',
+        "global_bol": "global_sea"
+    }
+    catalogs = {
+        'im': 'shipments_imports',
+        'ex': 'shipments_exports',
+    }
+    # database names excluded from reading old_mongo
+    excluded_dbs = ["un_global_trade_tatistics",
+                    "north_america_bol",
+                    "central_america_bol",
+                    "south_america_bol",
+                    "asia_bol",
+                    "middle_east_bol",
+                    "europe_bol",
+                    "africa_bol",
+                    "oceania_bol"]
+    # the lines below are for testing
+    # res = [{"mgdb": "global_bol", "catalog": "im"}]
+    # res = [{"mgdb": "ethiopia", "catalog": "ex"}]
+    mirror_dbs = ["fiji"]
+    mirror_dbs_date = {"fiji_im": "20211101", "fiji_ex": "20211101"}
+    for record in res:
+        mgdb = record['mgdb']
+        catalog = record['catalog']
+
+        prod_mgdb = mgdbs_prod.get(record['mgdb'], mgdb)
+        old_mgdb = mgdbs_old.get(record['mgdb'], mgdb)
+
+        if mgdb == "global_bol":
+            old_cnt = get_old_count(old_mgdb, catalogs[catalog])
+            # old mongo vs. the dwd regional split tables
+            clu_cnt = get_bigdata_global_bol_count(catalog, dt, spark)
+            bigdata_count = get_bigdata_global_bol_count(catalog, dt, spark)
+            date_null_cnt=get_clu_count_null(mgdb, catalogs[catalog])
+        else:
+            old_cnt = get_old_count(prod_mgdb, catalogs[catalog])
+            clu_cnt = get_clu_count(prod_mgdb, catalogs[catalog])
+            bigdata_count = get_bigdata_count(mgdb, catalog, dt, spark,cdt)
+            date_null_cnt = get_clu_count_null(mgdb, catalogs[catalog])
+            # get_year_count(mgdb, catalog, dt, spark)
+        if mgdb in mirror_dbs:
+            clu_cnt = get_count_range_date(mgdb, catalogs[catalog], target_date=mirror_dbs_date[f"{mgdb}_{catalog}"])
+            print(f"{mgdb}{catalogs[catalog]} clu_cnt: {clu_cnt}")
+        # compare record counts between the two mongo instances
+        cnt_diff = old_cnt - clu_cnt
+        # compare old mongo with the dwd layer
+        bd_diff = old_cnt - bigdata_count
+
+        if bd_diff != 0 or cnt_diff != 0 or date_null_cnt != 0:
+            msg3 = (
+                f"\n"
+                f"--------------------------------\n"
+                f"数据一致性警告\n"
+                f"--------------------------------\n"
+                f"在 {mgdb}_{catalog}  详细差异报告:\n\n"
+                f"\n"
+                f"--------------------------------\n"
+                f"计数对比:\n"
+                f"  old_mongo 计数: {old_cnt}\n"
+                f"  大数据_mongo 计数: {clu_cnt}\n"
+                f"  大数据平台 DWD 计数: {bigdata_count}\n"
+                f"  大数据_mongo `date`字段为空 计数: {date_null_cnt}\n"
+                f"\n"
+                f"请检查原因 \n"
+                f"\n"
+                f"--------------------------------\n"
+            )
+            if mgdb not in excluded_dbs:
+                send_dingtalk_notification(msg3)
+
+        # record the final per-country counts
+        statistical_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        sql_insert_cnt = f"""
+        
+        insert into table task.cts_country_count 
+        select '{mgdb}','{catalog}',{clu_cnt},'{statistical_time}','{dt}'
+        
+        """
+        spark.query(sql_insert_cnt)[0].collect()
+
+
+    sql_overwrite_cnt = f"""
+
+INSERT overwrite TABLE task.cts_country_count
+SELECT country,
+       catalog,
+       cnt,
+       creat_time,
+       dt
+FROM
+  ( SELECT *,
+           row_number() over (partition BY country,catalog
+                              ORDER BY `creat_time` DESC) AS rk
+  FROM task.cts_country_count
+  WHERE dt ={dt}   ) tmp 
+where rk =1
+           """
+    spark.query(sql_overwrite_cnt)[0].collect()
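+    # the INSERT OVERWRITE above dedups task.cts_country_count for this dt,
+    # keeping only the row with the latest creat_time per (country, catalog)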
+    check_call_count()
+
+if __name__ == '__main__':
+    main()
+
+
+# CREATE TABLE task.cts_country_count
+# (
+#     `country`    string COMMENT 'mgdb',
+#     `catalog`    string COMMENT '进出口类型',
+#     `cnt`        bigint comment '数据量',
+#     `creat_time` STRING COMMENT '统计时间'
+# )
+#     PARTITIONED BY ( `dt` string )
+#     TBLPROPERTIES ( 'COMMENT' = '同步到大数据平台的数据量统计');

+ 368 - 0
dw_base/scheduler/dingtalk_task_monitor_new.py

@@ -0,0 +1,368 @@
+# DingTalk monitor that checks whether the T+1 task needs to be rerun
+import sys
+import re
+import os
+import requests
+import json
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.spark.spark_sql import SparkSQL
+from dw_base.utils.log_utils import pretty_print
+from configparser import ConfigParser
+import time
+from datetime import datetime
+from pymongo import MongoClient
+from dw_base import *
+from dw_base.scheduler.polling_scheduler import get_mongo_client
+from dw_base.utils.config_utils import parse_args
+from dw_base.scheduler.mg2es.conf_reader import ConfReader
+from dw_base.scheduler.mg2es.es_operator import ESOperator
+from elasticsearch.exceptions import NotFoundError
+
+call_count = 0
+
+
+def check_call_count():
+    global call_count
+    if call_count == 0:
+        pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                     f'{NORM_MGT}向后传递参数: {NORM_GRN}is_run => 1 '
+                     f'{NORM_MGT} call_count =>{call_count}')
+        print('${setValue(is_run=%s)}' % '1')
+    else:
+        pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                     f'{NORM_MGT}向后传递参数: {NORM_GRN}is_run => 0 '
+                     f'{NORM_MGT} call_count =>{call_count}')
+        print('${setValue(is_run=%s)}' % '0')
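+
+# NOTE: check_call_count passes is_run downstream by printing "${setValue(is_run=...)}"
+# to stdout (1 when no discrepancy alert was sent, 0 otherwise); the scheduling
+# platform is assumed to parse this pattern from the job log to decide whether the
+# T+1 rerun task should run.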
+
+
+def send_dingtalk_notification(msg):
+    global call_count
+    call_count += 1
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "text",
+        "text": {"content": msg}
+    }
+    json_data = json.dumps(data)
+    # the URL below is used for testing
+    url = 'https://oapi.dingtalk.com/robot/send?access_token=d4955560edf9d78fbf5273fe3ea4022ecf5955570a68ff710f7fe81926dff71e'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+def send_dingtalk_notification_es(msg):
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "text",
+        "text": {"content": msg}
+    }
+    json_data = json.dumps(data)
+    # the URL below is used for testing
+    url = 'http://m1.node.cdh/dingtalk/api/robot/send?access_token=a4a48ed82627149f3317ee86e249fd7d973f5bed40fcac55cc2e7ca8d9ae0c61'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+
+def get_mongo_client(conf_path):
+    config_parser = ConfigParser()
+    config_parser.read(root_path + conf_path)
+    url = config_parser.get('base', 'address')
+    return MongoClient(url)
+
+
+def get_count(client, mgdb, mgtbl):
+    db = client[mgdb]
+    collection = db[mgtbl]
+    return collection.count()
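+
+
+# NOTE: Collection.count() above relies on the legacy PyMongo API (deprecated since
+# PyMongo 3.7, removed in 4.x); collection.count_documents({}) or
+# estimated_document_count() would be the modern equivalents if the driver is upgraded.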
+def get_count_null(client, mgdb, mgtbl):
+    db = client[mgdb]
+    collection = db[mgtbl]
+    # count documents whose `date` field is not null:
+    # return collection.count_documents({'date': {'$ne': None}})
+    # count documents whose `date` field is null
+    return collection.count_documents({'date': None})
+
+
+def get_old_count(mgdb, mgtbl):
+    client = get_mongo_client('/../datasource/mongo/mongo-cts-prod-old.ini')
+    result = get_count(client, mgdb, mgtbl)
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb}.{mgtbl} '
+                 f'{NORM_MGT} old data count: {NORM_GRN}{result}')
+    return result
+def get_clu_count_null(mgdb, mgtbl):
+    client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
+    result = get_count_null(client, mgdb, mgtbl)
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} cluster mongo: {NORM_GRN}{mgdb}.{mgtbl} '
+                 f'{NORM_MGT} null-`date` count: {NORM_GRN}{result}')
+    return result
+def get_dev_count_null(mgdb, mgtbl):
+    client = get_mongo_client('/../datasource/mongo/mongo-cts-dev-rw-200-test.ini')
+    result = get_count_null(client, mgdb, mgtbl)
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} dev mongo: {NORM_GRN}{mgdb}.{mgtbl} '
+                 f'{NORM_MGT} null-`date` count: {NORM_GRN}{result}')
+    return result
+
+
+def get_clu_count(mgdb, mgtbl):
+    client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
+    result = get_count(client, mgdb, mgtbl)
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} 大数据集群mongo sink mongo: {NORM_GRN}{mgdb}.{mgtbl} '
+                 f'{NORM_MGT} 大数据集群mongo data count: {NORM_GRN}{result}')
+    return result
+
+
+def get_bigdata_count(mgdb, mgtbl, dt, spark,cdt):
+    sql = (f"select count(1) cnt "
+           f"from dwd.cts_{mgdb}_{mgtbl} "
+           f" where dt in ('19700101', {dt},{cdt}) ")
+    res = spark.query(sql)[0].collect()
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} 大数据dwd表名: {NORM_GRN}dwd.cts_{mgdb}_{mgtbl} '
+                 f'{NORM_MGT} 大数据dwd 1970+昨日分区+当日分区 count: {NORM_GRN}{res[0].cnt}')
+    return res[0].cnt
+
+
+def get_bigdata_global_bol_count(catalog, dt, spark):
+    sql = (f"""
+    select sum(cnt) cnt from (select count(1) cnt from dwd.`cts_north_america_bol_{catalog}`   where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_central_america_bol_{catalog}`  where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_south_america_bol_{catalog}`    where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_asia_bol_{catalog}`             where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_middle_east_bol_{catalog}`      where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_europe_bol_{catalog}`           where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_africa_bol_{catalog}`           where dt in ('19700101', {dt}) 
+union all select count(1) from dwd.`cts_oceania_bol_{catalog}`          where dt in ('19700101', {dt})                 ) a""")
+    res = spark.query(sql)[0].collect()
+    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                 f'{NORM_MGT} 大数据dwd表名: global_bol 1拆8 '
+                 f'{NORM_MGT} 大数据dwd 1970+昨日分区count: {NORM_GRN}{res[0].cnt}')
+    return res[0].cnt
+
+
+def get_year_count(mgdb, catalog, dt, spark):
+    if mgdb != "global_bol":
+        sql = (f"select from_unixtime(cast(`date`/1000 as int)- 8 * 60 * 60, 'yyyy') as year,count(1) hive_cnt "
+               f"from dwd.cts_{mgdb}_{catalog} "
+               f" where dt in ('19700101', {dt}) "
+               f" group by from_unixtime(cast(`date`/1000 as int)- 8 * 60 * 60, 'yyyy')"
+               f" order by from_unixtime(cast(`date`/1000 as int)- 8 * 60 * 60, 'yyyy')")
+        res = spark.query(sql)[0].collect()
+        hive_year_cnt_dict = {}
+        es_year_cnt_dict = {}
+
+        host, port = ConfReader().get_es_conf()
+        es_operator = ESOperator(host, port)
+        for record in res:
+            year = record['year']
+            hive_cnt = record['hive_cnt']
+            hive_year_cnt_dict[year] = hive_cnt
+            # index_name = 'customs_' + str(catalogs[catalog]) + '_' + mgdb + '-' + year
+            index_name = str(catalog) + '_' + mgdb + '-' + year
+            try:
+                ES_year_cnt = es_operator.get_index_document_count(index_name)
+            except NotFoundError:
+                # the regular DingTalk robot filters by keyword, so this alert goes through the ES relay robot instead
+                msg7 = (f"ES Index {index_name} not found.\n"
+                        f" 请检查原因\n"
+                        )
+                # print(msg7)
+
+                send_dingtalk_notification_es(msg7)
+                ES_year_cnt = 0
+            if ES_year_cnt is None:
+                ES_year_cnt = 0
+            es_year_cnt_dict[year] = ES_year_cnt
+            es_diff = ES_year_cnt - hive_cnt
+            if es_diff != 0:
+                msg5 = (
+                    f"-----------------------------\n"
+                    f"\n"
+                    f"{mgdb}_{catalog} - 数据一致性警告:ES{year}与大数据DWD的{year}数量不一致。\n\n"
+                    f"详细差异报告:\n"
+                    f"-----------------------------------------------------------------------\n"
+                    f"年份:{year}\n"
+                    f"ES{year} 计数:{ES_year_cnt}\n"
+                    f"大数据{year} 计数:{hive_cnt}\n"
+                    f"差异值:{es_diff}\n"
+                    f"-----------------------------------------------------------------------\n"
+                    f"\n"
+                    f"请检查原因 \n"
+                    f"\n"
+                    f"-----------------------------\n"
+                )
+                # print(msg5)
+                send_dingtalk_notification_es(msg5)
+        pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
+                     f'{NORM_MGT} 大数据dwd表名: {NORM_GRN}dwd.cts_{mgdb}_{catalog} '
+                     f'{NORM_MGT} 大数据hive_year_cnt_dict  {NORM_GRN}{hive_year_cnt_dict}'
+                     f'{NORM_MGT} es_year_cnt_dict  {NORM_GRN}{es_year_cnt_dict}'
+                     )
+
+
+def get_count_range_date(mgdb, mgtbl, target_date):
+    """
+    Count documents whose `date` field is earlier than the target date.
+
+    Args:
+        mgdb: database name
+        mgtbl: collection name
+        target_date: target date string in "YYYYMMDD" format
+
+    Returns:
+        int: number of matching documents
+    """
+    client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
+    db = client[mgdb]
+    collection = db[mgtbl]
+
+    # convert the input string into a datetime object
+    target_date = datetime.strptime(target_date, "%Y%m%d").replace(
+        tzinfo=None  # this line can be dropped if the stored dates carry no timezone
+    )
+
+    count = collection.count_documents({'date': {'$lt': target_date}})
+    return count
+
+
+
+def main():
+    CONFIG, _ = parse_args(sys.argv[1:])
+    dt = CONFIG.get('dt')
+    cdt = CONFIG.get('cdt')
+    spark = SparkSQL()
+    spark._final_spark_config = {'hive.exec.dynamic.partition': 'true',
+                                 'hive.exec.dynamic.partition.mode': 'nonstrict',
+                                 'spark.yarn.queue': 'cts',
+                                 'spark.sql.crossJoin.enabled': 'true',
+                                 'spark.executor.memory': '6g',
+                                 'spark.executor.memoryOverhead': '2048',
+                                 'spark.driver.memory': '4g',
+                                 'spark.executor.instances': "15",
+                                 'spark.executor.cores': '2'
+                                 }
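+    # job-level Spark overrides: dynamic partitioning is required for the INSERT
+    # OVERWRITE into the dt-partitioned task.cts_country_count below; the remaining
+    # settings pin a ~15 executor x 2 core footprint on the cts queue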
+    sql = (f"select mgdb, catalog from task.mg_count_monitor "
+           f"where is_deleted = '0'")
+    res = spark.query(sql)[0].collect()
+    mgdbs_prod = {
+        'dwd表名': '大数据mongo库名',
+        'un_global_trade_tatistics': 'united_nations_stat',
+        "global_bol": "global_bol"
+    }
+    mgdbs_old = {
+        'dwd表名': 'old_mongo库名',
+        'un_global_trade_tatistics': 'united_nations_stat',
+        "global_bol": "global_sea"
+    }
+    catalogs = {
+        'im': 'shipments_imports',
+        'ex': 'shipments_exports',
+    }
+    # databases excluded from the old_mongo comparison alerts
+    excluded_dbs = ["un_global_trade_tatistics",
+                    "north_america_bol",
+                    "central_america_bol",
+                    "south_america_bol",
+                    "asia_bol",
+                    "middle_east_bol",
+                    "europe_bol",
+                    "africa_bol",
+                    "oceania_bol"]
+    # the entries below are for testing
+    # res = [{"mgdb": "global_bol", "catalog": "im"}]
+    # res = [{"mgdb": "ethiopia", "catalog": "ex"}]
+    mirror_dbs = ["fiji"]
+    mirror_dbs_date = {"fiji_im": "20211101", "fiji_ex": "20211101"}
+    for record in res:
+        mgdb = record['mgdb']
+        catalog = record['catalog']
+
+        prod_mgdb = mgdbs_prod.get(record['mgdb'], mgdb)
+        old_mgdb = mgdbs_old.get(record['mgdb'], mgdb)
+
+        if mgdb == "global_bol":
+            old_cnt = get_old_count(old_mgdb, catalogs[catalog])
+            # old mongo vs. the DWD split tables: global_bol is split into 8 regional tables,
+            # so the DWD union count is reused for both the cluster-mongo and DWD comparisons
+            clu_cnt = get_bigdata_global_bol_count(catalog, dt, spark)
+            bigdata_count = clu_cnt
+            date_null_cnt = get_clu_count_null(mgdb, catalogs[catalog])
+        else:
+            old_cnt = get_old_count(prod_mgdb, catalogs[catalog])
+            clu_cnt = get_clu_count(prod_mgdb, catalogs[catalog])
+            bigdata_count = get_bigdata_count(mgdb, catalog, dt, spark,cdt)
+            date_null_cnt = get_clu_count_null(mgdb, catalogs[catalog])
+            # get_year_count(mgdb, catalog, dt, spark)
+        if mgdb in mirror_dbs:
+            clu_cnt = get_count_range_date(mgdb, catalogs[catalog], target_date=mirror_dbs_date[f"{mgdb}_{catalog}"])
+            print(f"{mgdb}{catalogs[catalog]} clu_cnt: {clu_cnt}")
+        # old mongo vs. big-data cluster mongo
+        cnt_diff = old_cnt - clu_cnt
+        # old mongo vs. DWD on the big-data platform
+        bd_diff = old_cnt - bigdata_count
+
+        if bd_diff != 0 or cnt_diff != 0 or date_null_cnt != 0:
+            msg3 = (
+                f"\n"
+                f"--------------------------------\n"
+                f"数据一致性警告\n"
+                f"--------------------------------\n"
+                f"在 {mgdb}_{catalog}  详细差异报告:\n\n"
+                f"\n"
+                f"--------------------------------\n"
+                f"计数对比:\n"
+                f"  old_mongo 计数: {old_cnt}\n"
+                f"  大数据_mongo 计数: {clu_cnt}\n"
+                f"  大数据平台 DWD 计数: {bigdata_count}\n"
+                f"  大数据_mongo `date`字段为空 计数: {date_null_cnt}\n"
+                f"\n"
+                f"请检查原因 \n"
+                f"\n"
+                f"--------------------------------\n"
+            )
+            if mgdb not in excluded_dbs:
+                send_dingtalk_notification(msg3)
+
+        # record the final per-country data counts
+        statistical_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+        sql_insert_cnt = f"""
+        
+        insert into table task.cts_country_count 
+        select '{mgdb}','{catalog}',{clu_cnt},'{statistical_time}','{dt}'
+        
+        """
+        spark.query(sql_insert_cnt)[0].collect()
+
+
+    sql_overwrite_cnt = f"""
+
+INSERT overwrite TABLE task.cts_country_count
+SELECT country,
+       catalog,
+       cnt,
+       creat_time,
+       dt
+FROM
+  ( SELECT *,
+           row_number() over (partition BY country,catalog
+                              ORDER BY `creat_time` DESC) AS rk
+  FROM task.cts_country_count
+  WHERE dt ={dt}   ) tmp 
+where rk =1
+           """
+    spark.query(sql_overwrite_cnt)[0].collect()
+    check_call_count()
+
+if __name__ == '__main__':
+    main()
+
+# CREATE TABLE task.cts_country_count
+# (
+#     `country`    string COMMENT 'mgdb',
+#     `catalog`    string COMMENT '进出口类型',
+#     `cnt`        bigint comment '数据量',
+#     `creat_time` STRING COMMENT '统计时间'
+# )
+#     PARTITIONED BY ( `dt` string )
+#     TBLPROPERTIES ( 'COMMENT' = '同步到大数据平台的数据量统计');

+ 45 - 0
dw_base/scheduler/drop_daily_full_snapshot_tbls.py

@@ -0,0 +1,45 @@
+# Keeps only the recent partitions (per-table `days` window) plus 19700101 in dwd
+# daily_full_snapshot_tbls
+# task.daily_full_snapshot_tbls
+# test parameter:       -monitor_db test
+# production parameter: -monitor_db task  (optional, defaults to task)
+import datetime
+import sys
+import re
+import os
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.spark.spark_sql import SparkSQL
+from dw_base.utils.config_utils import parse_args
+
+
+def main():
+    CONFIG, _ = parse_args(sys.argv[1:])
+    monitor_db = CONFIG.get('monitor_db', 'task')
+    spark = SparkSQL()
+
+    sql1 = (f"select db, tbl,days from {monitor_db}.daily_full_snapshot_tbls "
+            f"where is_deleted = '0'")
+    res = spark.query(sql1)[0].collect()
+    for record in res:
+        db = record['db']
+        tbl = record['tbl']
+        ds = record['days']
+        days_ago = datetime.datetime.now() - datetime.timedelta(days = ds)
+        format_date = days_ago.strftime('%Y%m%d')
+        sql2 = (f"SHOW PARTITIONS {db}.{tbl}")
+        partitions = spark.query(sql2)[0].collect()
+        dts = set()
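+        # partition values are yyyymmdd strings, so plain string comparison below is
+        # equivalent to a date comparison; 19700101 and 20200101 are always kept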
+        for dt in partitions:
+            a1 = dt['partition'].split('=')[1][:8]
+            if a1 < format_date and a1 != '19700101' and a1 != '20200101':
+                dts.add(a1)
+        for p in dts:
+            sql3 = f" alter TABLE {db}.{tbl} DROP PARTITION ( dt='{p}') "
+            spark.query(sql3)[0].collect()
+
+
+if __name__ == "__main__":
+    main()

+ 45 - 0
dw_base/scheduler/drop_partitions.py

@@ -0,0 +1,45 @@
+# Keeps only the most recent partitions (cutoff a few days back) and 19700101 in dwd
+# test parameters:       -slect_db tmp  -drop_db dwd_smp
+# production parameters: -slect_db task -drop_db dwd
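+# example invocation (assumed launcher; the actual starter script may differ):
+#   python dw_base/scheduler/drop_partitions.py -slect_db task -drop_db dwd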
+import datetime
+import sys
+import re
+import os
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.spark.spark_sql import SparkSQL
+from dw_base.utils.config_utils import parse_args
+
+def main():
+    CONFIG, _ = parse_args(sys.argv[1:])
+    slect_db = CONFIG.get('slect_db')
+    drop_db = CONFIG.get('drop_db')
+    spark=SparkSQL()
+
+    # NOTE: despite the variable name, the cutoff is 4 days back
+    seven_days_ago = datetime.datetime.now() - datetime.timedelta(days=4)
+    format_date = seven_days_ago.strftime('%Y%m%d')
+
+    sql1 = (f"select mgdb, catalog from {slect_db}.mg_count_monitor "
+            f"where is_deleted = '0'")
+    res = spark.query(sql1)[0].collect()
+    extends_db=["global_bol"]
+    for record in res:
+        mgdb = record['mgdb']
+        catalog = record['catalog']
+        if mgdb not in extends_db:
+            sql2 = (f"SHOW PARTITIONS {drop_db}.cts_{mgdb}_{catalog}")
+            partitions = spark.query(sql2)[0].collect()
+            dts = []
+            for dt in partitions:
+                a1 = dt['partition'].split('=')[1]
+                if a1 < format_date and a1 != '19700101' and a1 != '20200101':
+                    dts.append(a1)
+            for p in dts:
+                sql3 = f" alter TABLE {drop_db}.cts_{mgdb}_{catalog} DROP PARTITION ( dt='{p}') "
+                spark.query(sql3)[0].collect()
+
+
+if __name__ == "__main__":
+    main()

+ 498 - 0
dw_base/scheduler/ent_interface_dingtalk.py

@@ -0,0 +1,498 @@
+import base64
+import hashlib
+import hmac
+import sys
+import re
+import os
+import urllib
+import time
+import requests
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.utils.config_utils import parse_args
+from dw_base.spark.spark_sql import SparkSQL
+import http.client
+import json
+
+from cryptography.hazmat.primitives.asymmetric import rsa, padding
+from cryptography.hazmat.primitives import serialization
+from base64 import b64encode
+
+# RSA public key (PEM)
+public_key_pem = b"""
+-----BEGIN PUBLIC KEY-----
+MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDSaL/mqfq/30d5w6/05EL4073z
+ZgsomKTDI9wKUyz+ETkGwWzaNQm8BAXk9nJMCPz25fCTPd2BkifrS2KFKK2+e4hU
+pQxs+FQGaSeR8YEBWsCwh8bWaFWgxKuWpPPdfP6Vcnid/pTAsjbnw0KIHT7x83WZ
+qQTu3GUdyXkfyB41CQIDAQAB
+-----END PUBLIC KEY-----
+"""
+
+
+class UserInfo:
+    """公司名称"""
+    company_name: str
+    """真实名称"""
+    name: str
+    """用户id"""
+    user_id: int
+    """用户名"""
+    username: str
+
+    def __init__(self, company_name: str, name: str, user_id: int, username: str) -> None:
+        self.company_name = company_name
+        self.name = name
+        self.user_id = user_id
+        self.username = username
+
+    def __str__(self) -> str:
+        return (f"UserInfo:\n"
+                f"  Company Name: {self.company_name}\n"
+                f"  Name: {self.name}\n"
+                f"  User ID: {self.user_id}\n"
+                f"  Username: {self.username}")
+
+
+def encrypt_user_id(user_id):
+    public_key = serialization.load_pem_public_key(public_key_pem)
+    encrypted = public_key.encrypt(
+        user_id.encode(),
+        padding.PKCS1v15()
+    )
+    return b64encode(encrypted).decode()
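+
+# NOTE: the user id is encrypted with RSA PKCS#1 v1.5 under what appears to be a
+# 1024-bit public key, so the plaintext must stay under ~117 bytes -- a short
+# numeric user id is well within that limit.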
+
+
+def get_user_info(user_id):
+    encrypted_user_id = encrypt_user_id(user_id)
+    conn = http.client.HTTPConnection("192.168.11.6", 18080)
+    payload = json.dumps({
+        "encryptUserId": encrypted_user_id
+    })
+    headers = {
+        'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
+        'Content-Type': 'application/json'
+    }
+
+    try:
+        conn.request("POST", "/account/personal", payload, headers)
+        res = conn.getresponse()
+        resdata = res.read().decode("utf-8")
+        res_json = json.loads(resdata)
+        user_info = UserInfo(res_json['companyName'], res_json['name'], res_json['userId'], res_json['username'])
+        return user_info
+    except Exception as e:
+        print("Error:", e)
+    finally:
+        conn.close()
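+
+# NOTE: on a failed request get_user_info logs the error and implicitly returns
+# None; ent_user_top below does not guard against that, so a failed lookup would
+# raise AttributeError when the row is formatted.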
+
+
+spark = SparkSQL(udf_files=['dw_base/spark/udf/contacts/ctc_common.py',
+                            'dw_base/spark/udf/spark_id_generate_udf.py'])
+
+
+def get_sign(secret):
+    timestamp = str(round(time.time() * 1000))
+    secret_enc = secret.encode('utf-8')
+    string_to_sign = '{}\n{}'.format(timestamp, secret)
+    string_to_sign_enc = string_to_sign.encode('utf-8')
+    hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest()
+    sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
+    return timestamp, sign
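+
+# NOTE: get_sign implements DingTalk's signed-webhook scheme: HMAC-SHA256 over
+# "{timestamp}\n{secret}", base64-encoded and then URL-encoded; send_dingtalk_markdown
+# appends timestamp and sign as query parameters on the robot URL.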
+
+
+def send_dingtalk_markdown(msg):
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "markdown",
+        "markdown": {"title": '企业库告警', "text": msg, }
+    }
+    json_data = json.dumps(data)
+    secret = 'SECffb7fe1b4c3aacc7be85d3b03de88fdbf93dfb48fe1c13ea7dba34a84847675e'
+    timestamp, sign = get_sign(secret)
+    url = f'https://oapi.dingtalk.com/robot/send?access_token=ffdb7df856220a925196e911107a4aa259acb2fd1160fee8b11d0c3c800974fc&timestamp={timestamp}&sign={sign}'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+
+def send_dingtalk_notification(msg):
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "text",
+        "text": {"content": msg}
+    }
+    json_data = json.dumps(data)
+    url = 'https://oapi.dingtalk.com/robot/send?access_token=5183dfe1ecbe06261bcac7b45c1a6b5ae101fec67877d74120a6a95c88d1f917'
+    # url = 'https://oapi.dingtalk.com/robot/send?access_token=c4086d8ba377fdade2dff869e71063733095bc718d3bafdfbe8be0966aa050d6'
+    # url = 'https://oapi.dingtalk.com/robot/send?access_token=bee997dbf61e839a17de087830ffef6e864c3109fef62a956703bdfe043b0e10'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+
+# shh non-core business interface call count
+def get_shh_non_core_interface_cnt(dt):
+    sql = f'''
+SELECT sum(cnt) cnt
+FROM
+  (SELECT count(1) cnt
+   FROM ent_raw.interface_base
+   WHERE topic = "ent_monitor_interface"
+     AND dt = "{dt}"
+     AND GET_JSON_OBJECT(ori_json, "$.type") != "EXPORT"
+     AND GET_JSON_OBJECT(ori_json, "$.source")= 'CONTACT'
+   UNION ALL SELECT count(1) cnt
+   FROM ent_raw.interface_base
+   WHERE topic = "ent_shh_bizr_interface"
+     AND dt = "{dt}"
+     AND GET_JSON_OBJECT(ori_json, "$.type") IN("ROOT",
+                                                "COMPANY_COUNT")
+     AND GET_JSON_OBJECT(ori_json, "$.source")= 'BIZR'
+   UNION ALL SELECT count(1) cnt
+   FROM ent_raw.interface_base
+   WHERE topic = "ent_shh_mecs_interface"
+     AND dt = "{dt}"
+     AND GET_JSON_OBJECT(ori_json, "$.type") IN("CORP",
+                                                "SITE")
+     AND GET_JSON_OBJECT(ori_json, "$.source")= 'MECS'
+      UNION ALL SELECT count(1) cnt
+   FROM ent_raw.interface_base
+   WHERE topic = "ent_shh_interface"
+     AND dt = "{dt}"
+     AND GET_JSON_OBJECT(ori_json, "$.type")= "BIZR"
+     AND GET_JSON_OBJECT(ori_json, "$.source")= "BIZR"
+     )t
+     '''
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def get_shh_company_interface_cnt(dt):
+    sql = f'select count(1) cnt from ent_ods.ent_shh_api_company_logs where dt = "{dt}" and source != "SCRIPT"'
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def get_shh_company_interface_script_cnt(dt):
+    sql = f'select count(1) cnt from ent_ods.ent_shh_api_company_logs where dt = "{dt}" and source = "SCRIPT"'
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def get_shh_contact_interface_cnt(dt):
+    sql = f'select count(1) cnt from ent_raw.interface_base where topic = "ctc_shh_interface" and dt = "{dt}" and GET_JSON_OBJECT(ori_json, "$.source") != "SCRIPT"'
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def get_shh_contact_interface_script_cnt(dt):
+    sql = f'select count(1) cnt from ent_raw.interface_base where topic = "ctc_shh_interface" and dt = "{dt}" and GET_JSON_OBJECT(ori_json, "$.source") = "SCRIPT"'
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def get_snv_contact_interface_cnt(dt):
+    sql = f'select count(1) cnt from ent_raw.interface_base where topic = "ctc_snovio_interface" and dt = "{dt}" and GET_JSON_OBJECT(ori_json, "$.source") != "MANUAL_CONSUME" '
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def get_snv_contact_interface_script_cnt(dt):
+    sql = f'select count(1) cnt from ent_raw.interface_base where topic = "ctc_snovio_interface" and dt = "{dt}" and GET_JSON_OBJECT(ori_json, "$.source") = "MANUAL_CONSUME" '
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def ent_user_top(dt):
+    sql = (f"select GET_JSON_OBJECT(ori_json, '$.params.userId') as  user ,count(1) as cnt from ent_raw.interface_base "
+           f"where dt='{dt}' and  topic = 'ent_tendata_interface' and GET_JSON_OBJECT(ori_json, '$.type') = 'BRIEF_RESULT' group by GET_JSON_OBJECT(ori_json, '$.params.userId') order by count(1) desc limit 10"
+           )
+    body = ''
+    for row in spark.query(sql)[0].collect():
+        userid = row.user
+        user_info = get_user_info(userid)
+        body += f'{user_info.username},{user_info.name},{user_info.company_name},**{row.cnt}**次 \n\n'
+    return body
+
+def get_manual_request_cnt(dt):
+    sql = f'''SELECT count(DISTINCT GET_JSON_OBJECT(ori_json, '$.params.traceId')) manual_request_cnt,
+              count(distinct GET_JSON_OBJECT(ori_json, '$.params.userId'))                  as user_cnt
+   FROM ent_raw.interface_base
+   WHERE topic = 'ent_tendata_interface'
+     AND dt = '{dt}'
+     AND get_json_object(ori_json, '$.source') = 'BING'
+     AND get_json_object(ori_json, '$.type') = 'MANUAL_REFRESH'
+     AND get_json_object(ori_json, '$.result.canRefresh') = 'true'
+     '''
+    return spark.query(sql)[0].collect()[0]
+def get_ggl_res(dt):
+    sql = f'''WITH MANUAL AS
+  (SELECT DISTINCT GET_JSON_OBJECT(ori_json, '$.params.traceId') trace_id
+   FROM ent_raw.interface_base
+   WHERE topic = 'ent_tendata_interface'
+     AND dt = '{dt}'
+     AND get_json_object(ori_json, '$.source') = 'BING'
+     AND get_json_object(ori_json, '$.type') = 'MANUAL_REFRESH'
+     AND get_json_object(ori_json, '$.result.canRefresh') = 'true'), auto AS
+  (SELECT DISTINCT GET_JSON_OBJECT(ori_json, '$.params.traceId') trace_id
+   FROM ent_raw.interface_base
+   WHERE topic = 'ent_tendata_interface'
+     AND dt = '{dt}'
+     AND get_json_object(ori_json, '$.source') = 'BING'
+     AND get_json_object(ori_json, '$.type') = 'AUTO_REFRESH'
+     AND get_json_object(ori_json, '$.result.canRefresh') = 'true'),
+                                                                     ods AS
+  (SELECT GET_JSON_OBJECT(ori_json, '$.params.traceId') trace_id,
+          GET_JSON_OBJECT(ori_json, '$.result.status_code') res_code
+   FROM ent_raw.interface_base
+   WHERE topic = 'ctc_google_interface'
+     AND dt = '{dt}'),
+                                                                     manual_res AS
+  (SELECT 'manual',
+          sum(if(ods.res_code = '200',1,0))  as cnt
+   FROM ods
+   JOIN MANUAL ON ods.trace_id = manual.trace_id),
+                                                                     auto_res AS
+  (SELECT 'auto',
+          sum(if(ods.res_code = '200',1,0))  as cnt
+   FROM ods
+   JOIN auto ON ods.trace_id = auto.trace_id),
+                                                                     ctc_cnt AS
+  (SELECT 'ctc_cnt',
+          count(1) as cnt
+   FROM ctc_mid.ctc_main_pre
+   WHERE dt = '{dt}'
+     AND SOURCE LIKE '%google%' )
+SELECT *
+FROM manual_res
+UNION ALL
+SELECT *
+FROM auto_res
+UNION ALL
+SELECT *
+FROM ctc_cnt
+    '''
+    row = spark.query(sql)[0].collect()
+    return row
+
+
+def get_manual_base(dt):
+    sql = f'''
+    select count(1)                                                                      as cnt,
+       count(distinct GET_JSON_OBJECT(ori_json, '$.params.userId'))                  as user_cnt,
+       nvl(sum(if(get_json_object(ori_json, '$.result.data.website') is not null, 1, 0)),0) as web_cnt
+from ent_raw.interface_base
+where topic = 'ent_tendata_interface'
+  and dt = '{dt}'
+  and get_json_object(ori_json, '$.source') = 'BING'
+  and get_json_object(ori_json, '$.type') = 'MANUAL'
+  '''
+    row = spark.query(sql)[0].collect()[0]
+    return row
+
+
+def get_auto_base(dt):
+    sql = f'''
+    select count(1)                                                                      as cnt,
+       count(distinct GET_JSON_OBJECT(ori_json, '$.params.userId'))                  as user_cnt,
+       nvl(sum(if(get_json_object(ori_json, '$.result.data.website') is not null, 1, 0)), 0) as web_cnt
+from ent_raw.interface_base
+where topic = 'ent_tendata_interface'
+  and dt = '{dt}'
+  and get_json_object(ori_json, '$.source') = 'BING'
+  and get_json_object(ori_json, '$.type') = 'AUTO'
+  '''
+    row = spark.query(sql)[0].collect()[0]
+    return row
+
+
+def get_manual_cnt(dt):
+    sql = f'''
+    with webs as (select distinct get_json_object(ori_json, '$.result.data.website') as website
+              from ent_raw.interface_base
+              where topic = 'ent_tendata_interface'
+                and dt = '{dt}'
+                and get_json_object(ori_json, '$.source') = 'BING'
+                and get_json_object(ori_json, '$.type') = 'MANUAL'
+                and get_json_object(ori_json, '$.result.data.website') is not null),
+     tids as (select website, generate_tid(clean_website(website), 'not_null', null) as tid
+              from webs),
+     pre as (select i.id, i.tid, i.source
+             from ctc_mid.ctc_main_pre i
+                      join tids t on i.tid = t.tid
+             where i.dt = '{dt}'),
+     shh as (select 'shh'               as source,
+                    count(distinct tid) as res_cnt,
+                    count(id)           as ctc_cnt
+             from pre
+             where source like '%shh_%'),
+     snovio as (select 'snovio'            as source,
+                       count(distinct tid) as res_cnt,
+                       count(id)           as ctc_cnt
+                from pre
+                where source like '%snovio%'),
+     all_t as (select 'all'               as source,
+                    count(distinct tid) as res_cnt,
+                    count(id)           as ctc_cnt
+             from pre
+             where source like '%snovio%'
+                or source like '%shh_%')
+select *
+from shh
+union all
+select *
+from snovio
+union all
+select *
+from all_t
+  '''
+    res = spark.query(sql)[0].collect()
+    return res
+
+
+def get_auto_cnt(dt):
+    sql = f'''
+    with webs as (select distinct get_json_object(ori_json, '$.result.data.website') as website
+              from ent_raw.interface_base
+              where topic = 'ent_tendata_interface'
+                and dt = '{dt}'
+                and get_json_object(ori_json, '$.source') = 'BING'
+                and get_json_object(ori_json, '$.type') = 'AUTO'
+                and get_json_object(ori_json, '$.result.data.website') is not null),
+     tids as (select website, generate_tid(clean_website(website), 'not_null', null) as tid
+              from webs),
+     pre as (select i.id, i.tid, i.source
+             from ctc_mid.ctc_main_pre i
+                      join tids t on i.tid = t.tid
+             where i.dt = '{dt}'),
+     shh as (select 'shh'               as source,
+                    count(distinct tid) as res_cnt,
+                    count(id)           as ctc_cnt
+             from pre
+             where source like '%shh_%'),
+     snovio as (select 'snovio'            as source,
+                       count(distinct tid) as res_cnt,
+                       count(id)           as ctc_cnt
+                from pre
+                where source like '%snovio%'),
+     all_t as (select 'all'               as source,
+                    count(distinct tid) as res_cnt,
+                    count(id)           as ctc_cnt
+             from pre
+             where source like '%snovio%'
+                or source like '%shh_%')
+select *
+from shh
+union all
+select *
+from snovio
+union all
+select *
+from all_t
+  '''
+    res = spark.query(sql)[0].collect()
+    return res
+
+
+if __name__ == '__main__':
+    CONFIG, _ = parse_args(sys.argv[1:])
+    dts = CONFIG.get('dt').split(',')
+    for dt in dts:
+        format_dt = f'{dt[:4]}-{dt[4:6]}-{dt[6:]}'
+        shh_company_interface_cnt = get_shh_company_interface_cnt(dt)
+        shh_company_interface_script_cnt = get_shh_company_interface_script_cnt(dt)
+        shh_contact_interface_cnt = get_shh_contact_interface_cnt(dt)
+        shh_contact_interface_script_cnt = get_shh_contact_interface_script_cnt(dt)
+        snv_contact_interface_cnt = get_snv_contact_interface_cnt(dt)
+        snv_contact_interface_script_cnt = get_snv_contact_interface_script_cnt(dt)
+        shh_non_core_interface_cnt = get_shh_non_core_interface_cnt(dt)
+        msg = f'''【接口调用量统计】------------------------------------------
+统计日期: {format_dt}
+1、单接口调用公司信息次数: {shh_company_interface_cnt + shh_company_interface_script_cnt}
+①自然调用次数: {shh_company_interface_cnt}
+②脚本调用次数: {shh_company_interface_script_cnt}
+
+2、单接口调用联系人次数: {shh_contact_interface_cnt + shh_contact_interface_script_cnt}
+①自然调用次数: {shh_contact_interface_cnt}
+②脚本调用次数: {shh_contact_interface_script_cnt}
+
+3、snovio调用联系人次数: {snv_contact_interface_cnt + snv_contact_interface_script_cnt}
+①自然调用次数: {snv_contact_interface_cnt}
+②脚本调用次数: {snv_contact_interface_script_cnt}
+
+4、单接口非核心业务调用次数:{shh_non_core_interface_cnt}
+---------------------------------------------------------------'''
+        print(msg)
+        send_dingtalk_notification(msg)
+        ent_user_top_cnt = ent_user_top(dt)
+        msg = f'''### 企业主页接口调用统计top10
+> **统计日期 :  {format_dt}**
+
+{ent_user_top_cnt}
+                '''
+        print(msg)
+        send_dingtalk_markdown(msg)
+
+        manual_base = get_manual_base(dt)
+        auto_base = get_auto_base(dt)
+        manual_cnt = get_manual_cnt(dt)
+        auto_cnt = get_auto_cnt(dt)
+        manual_request_res = get_manual_request_cnt(dt)
+        manual_request_cnt = manual_request_res['manual_request_cnt']
+        manual_user_cnt = manual_request_res['user_cnt']
+        # guard against zero denominators below
+        # manual_user_cnt = manual_base['user_cnt']
+        manual_cnt_total = manual_base['cnt']
+        manual_web_cnt = manual_base['web_cnt']
+        auto_user_cnt = auto_base['user_cnt']
+        auto_cnt_total = auto_base['cnt']
+        auto_web_cnt = auto_base['web_cnt']
+
+        manual_avg_requests = manual_request_cnt / manual_user_cnt if manual_user_cnt != 0 else 0
+        manual_web_percentage = 100 * manual_web_cnt / manual_cnt_total if manual_cnt_total != 0 else 0
+        manual_single_interface_percentage = (
+            100 * manual_cnt[0]['res_cnt'] / manual_web_cnt if manual_web_cnt != 0 else 0)
+        manual_snovio_percentage = 100 * manual_cnt[1]['res_cnt'] / manual_web_cnt if manual_web_cnt != 0 else 0
+        manual_solution_percentage = 100 * manual_cnt[2]['res_cnt'] / manual_cnt_total if manual_cnt_total != 0 else 0
+
+        auto_avg_requests = auto_cnt_total / auto_user_cnt if auto_user_cnt != 0 else 0
+        auto_web_percentage = 100 * auto_web_cnt / auto_cnt_total if auto_cnt_total != 0 else 0
+        auto_single_interface_percentage = 100 * auto_cnt[0]['res_cnt'] / auto_web_cnt if auto_web_cnt != 0 else 0
+        auto_snv_denominator = auto_web_cnt - auto_cnt[0]['res_cnt']
+        auto_snovio_percentage = (
+            100 * auto_cnt[1]['res_cnt'] / auto_snv_denominator if auto_snv_denominator != 0 else 0)
+        auto_solution_percentage = (
+            100 * (auto_cnt[0]['res_cnt'] + auto_cnt[1]['res_cnt']) / auto_cnt_total if auto_cnt_total != 0 else 0)
+        ggl_res = get_ggl_res(dt)
+        manual_ggl_cnt = ggl_res[0]['cnt']
+        auto_ggl_cnt = ggl_res[1]['cnt']
+        ctc_ggl_cnt = ggl_res[2]['cnt']
+        msg = f'''【手动/自动更新效果统计】------------------------------------------
+统计日期: {format_dt}
+1、手动更新
+①手动更新请求总人数:{manual_user_cnt}人
+②手动更新请求总次数:{manual_request_cnt}次
+③人均请求次数:{manual_avg_requests:.2f}次
+④手动请求bing网址总次数:{manual_cnt_total}次
+⑤bing获取到网址的次数及占比:{manual_web_cnt}次,{manual_web_percentage:.2f}%
+⑥单接口获取到联系人次数及占比:{manual_cnt[0]['res_cnt']}次,{manual_single_interface_percentage:.2f}%
+⑦单接口获取到联系人去重总数:{manual_cnt[0]['ctc_cnt']}
+⑧snovio接口获取到联系人次数及占比:{manual_cnt[1]['res_cnt']}次,{manual_snovio_percentage:.2f}%
+⑨snovio接口获取到联系人去重总数:{manual_cnt[1]['ctc_cnt']}
+⑩当日手动更新获得联系方式的总次数:{manual_cnt[2]['res_cnt']}
+⑪当日手动更新解决问题的百分比:{manual_solution_percentage:.2f}%
+
+2、自动更新
+①自动更新对应的总人数:{auto_user_cnt}人
+②自动更新请求总次数:{auto_cnt_total}次
+③人均对应自动更新次数:{auto_avg_requests:.2f}次
+④bing获取到网址的次数及占比:{auto_web_cnt}次,{auto_web_percentage:.2f}%
+⑤单接口获取到联系人次数及占比:{auto_cnt[0]['res_cnt']}次,{auto_single_interface_percentage:.2f}%
+⑥单接口获取到联系人去重总数:{auto_cnt[0]['ctc_cnt']}
+⑦snovio接口获取到联系人次数及占比:{auto_cnt[1]['res_cnt']}次,{auto_snovio_percentage:.2f}%
+⑧snovio接口获取到联系人去重总数:{auto_cnt[1]['ctc_cnt']}
+⑨当日自动更新获得联系方式的总次数:{auto_cnt[0]['res_cnt'] + auto_cnt[1]['res_cnt']}
+⑩当日自动更新解决问题的百分比:{auto_solution_percentage:.2f}%
+
+3、google补充
+①手动触发google爬虫获取到联系人次数:{manual_ggl_cnt}次
+②自动触发google爬虫获取到联系人次数:{auto_ggl_cnt}次
+③google爬虫获取到联系人去重数:{ctc_ggl_cnt}人
+--------------------------------------------------------------- '''
+        print(msg)
+        send_dingtalk_notification(msg)

+ 132 - 0
dw_base/scheduler/ent_interface_dingtalk_call.py

@@ -0,0 +1,132 @@
+import base64
+import hashlib
+import hmac
+import sys
+import re
+import os
+import urllib
+import time
+import requests
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.utils.config_utils import parse_args
+from dw_base.spark.spark_sql import SparkSQL
+import json
+
+spark = SparkSQL(udf_files=['dw_base/spark/udf/contacts/ctc_common.py',
+                            'dw_base/spark/udf/spark_id_generate_udf.py'])
+
+
+
+def send_dingtalk_notification(msg):
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "text",
+        "text": {"content": msg}
+    }
+    json_data = json.dumps(data)
+    url = 'https://oapi.dingtalk.com/robot/send?access_token=5183dfe1ecbe06261bcac7b45c1a6b5ae101fec67877d74120a6a95c88d1f917'
+    # url = 'https://oapi.dingtalk.com/robot/send?access_token=c4086d8ba377fdade2dff869e71063733095bc718d3bafdfbe8be0966aa050d6'
+    # url = 'https://oapi.dingtalk.com/robot/send?access_token=bee997dbf61e839a17de087830ffef6e864c3109fef62a956703bdfe043b0e10'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+
+# shh non-core business interface call count
+def get_shh_non_core_interface_cnt(dt):
+    sql = f'''
+SELECT sum(cnt) cnt
+FROM
+  (SELECT count(1) cnt
+   FROM ent_raw.interface_base
+   WHERE topic = "ent_monitor_interface"
+     AND dt = "{dt}"
+     AND GET_JSON_OBJECT(ori_json, "$.type") != "EXPORT"
+     AND GET_JSON_OBJECT(ori_json, "$.source")= 'CONTACT'
+   UNION ALL SELECT count(1) cnt
+   FROM ent_raw.interface_base
+   WHERE topic = "ent_shh_bizr_interface"
+     AND dt = "{dt}"
+     AND GET_JSON_OBJECT(ori_json, "$.type") IN("ROOT",
+                                                "COMPANY_COUNT")
+     AND GET_JSON_OBJECT(ori_json, "$.source")= 'BIZR'
+   UNION ALL SELECT count(1) cnt
+   FROM ent_raw.interface_base
+   WHERE topic = "ent_shh_mecs_interface"
+     AND dt = "{dt}"
+     AND GET_JSON_OBJECT(ori_json, "$.type") IN("CORP",
+                                                "SITE")
+     AND GET_JSON_OBJECT(ori_json, "$.source")= 'MECS'
+      UNION ALL SELECT count(1) cnt
+   FROM ent_raw.interface_base
+   WHERE topic = "ent_shh_interface"
+     AND dt = "{dt}"
+     AND GET_JSON_OBJECT(ori_json, "$.type")= "BIZR"
+     AND GET_JSON_OBJECT(ori_json, "$.source")= "BIZR"
+     )t
+     '''
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def get_shh_company_interface_cnt(dt):
+    sql = f'select count(1) cnt from ent_ods.ent_shh_api_company_logs where dt = "{dt}" and source != "SCRIPT"'
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def get_shh_company_interface_script_cnt(dt):
+    sql = f'select count(1) cnt from ent_ods.ent_shh_api_company_logs where dt = "{dt}" and source = "SCRIPT"'
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def get_shh_contact_interface_cnt(dt):
+    sql = f'select count(1) cnt from ent_raw.interface_base where topic = "ctc_shh_interface" and dt = "{dt}" and GET_JSON_OBJECT(ori_json, "$.source") != "SCRIPT"'
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def get_shh_contact_interface_script_cnt(dt):
+    sql = f'select count(1) cnt from ent_raw.interface_base where topic = "ctc_shh_interface" and dt = "{dt}" and GET_JSON_OBJECT(ori_json, "$.source") = "SCRIPT"'
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def get_snv_contact_interface_cnt(dt):
+    sql = f'select count(1) cnt from ent_raw.interface_base where topic = "ctc_snovio_interface" and dt = "{dt}" and GET_JSON_OBJECT(ori_json, "$.source") != "MANUAL_CONSUME" '
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+def get_snv_contact_interface_script_cnt(dt):
+    sql = f'select count(1) cnt from ent_raw.interface_base where topic = "ctc_snovio_interface" and dt = "{dt}" and GET_JSON_OBJECT(ori_json, "$.source") = "MANUAL_CONSUME" '
+    return spark.query(sql)[0].collect()[0]['cnt']
+
+
+if __name__ == '__main__':
+    CONFIG, _ = parse_args(sys.argv[1:])
+    dts = CONFIG.get('dt').split(',')
+    for dt in dts:
+        format_dt = f'{dt[:4]}-{dt[4:6]}-{dt[6:]}'
+        shh_company_interface_cnt = get_shh_company_interface_cnt(dt)
+        shh_company_interface_script_cnt = get_shh_company_interface_script_cnt(dt)
+        shh_contact_interface_cnt = get_shh_contact_interface_cnt(dt)
+        shh_contact_interface_script_cnt = get_shh_contact_interface_script_cnt(dt)
+        snv_contact_interface_cnt = get_snv_contact_interface_cnt(dt)
+        snv_contact_interface_script_cnt = get_snv_contact_interface_script_cnt(dt)
+        shh_non_core_interface_cnt = get_shh_non_core_interface_cnt(dt)
+        msg = f'''【接口调用量统计】------------------------------------------
+统计日期: {format_dt}
+1、单接口调用公司信息次数: {shh_company_interface_cnt + shh_company_interface_script_cnt}
+①自然调用次数: {shh_company_interface_cnt}
+②脚本调用次数: {shh_company_interface_script_cnt}
+
+2、单接口调用联系人次数: {shh_contact_interface_cnt + shh_contact_interface_script_cnt}
+①自然调用次数: {shh_contact_interface_cnt}
+②脚本调用次数: {shh_contact_interface_script_cnt}
+
+3、snovio调用联系人次数: {snv_contact_interface_cnt + snv_contact_interface_script_cnt}
+①自然调用次数: {snv_contact_interface_cnt}
+②脚本调用次数: {snv_contact_interface_script_cnt}
+
+4、单接口非核心业务调用次数:{shh_non_core_interface_cnt}
+---------------------------------------------------------------'''
+        print(msg)
+        send_dingtalk_notification(msg)

+ 141 - 0
dw_base/scheduler/ent_interface_dingtalk_top10.py

@@ -0,0 +1,141 @@
+import base64
+import hashlib
+import hmac
+import sys
+import re
+import os
+import urllib
+import time
+import requests
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.utils.config_utils import parse_args
+from dw_base.spark.spark_sql import SparkSQL
+import http.client
+import json
+
+from cryptography.hazmat.primitives.asymmetric import rsa, padding
+from cryptography.hazmat.primitives import serialization
+from base64 import b64encode
+
+# RSA public key (PEM)
+public_key_pem = b"""
+-----BEGIN PUBLIC KEY-----
+MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDSaL/mqfq/30d5w6/05EL4073z
+ZgsomKTDI9wKUyz+ETkGwWzaNQm8BAXk9nJMCPz25fCTPd2BkifrS2KFKK2+e4hU
+pQxs+FQGaSeR8YEBWsCwh8bWaFWgxKuWpPPdfP6Vcnid/pTAsjbnw0KIHT7x83WZ
+qQTu3GUdyXkfyB41CQIDAQAB
+-----END PUBLIC KEY-----
+"""
+
+
+class UserInfo:
+    """公司名称"""
+    company_name: str
+    """真实名称"""
+    name: str
+    """用户id"""
+    user_id: int
+    """用户名"""
+    username: str
+
+    def __init__(self, company_name: str, name: str, user_id: int, username: str) -> None:
+        self.company_name = company_name
+        self.name = name
+        self.user_id = user_id
+        self.username = username
+
+    def __str__(self) -> str:
+        return (f"UserInfo:\n"
+                f"  Company Name: {self.company_name}\n"
+                f"  Name: {self.name}\n"
+                f"  User ID: {self.user_id}\n"
+                f"  Username: {self.username}")
+
+
+def encrypt_user_id(user_id):
+    public_key = serialization.load_pem_public_key(public_key_pem)
+    encrypted = public_key.encrypt(
+        user_id.encode(),
+        padding.PKCS1v15()
+    )
+    return b64encode(encrypted).decode()
+
+
+def get_user_info(user_id):
+    encrypted_user_id = encrypt_user_id(user_id)
+    conn = http.client.HTTPConnection("192.168.11.6", 18080)
+    payload = json.dumps({
+        "encryptUserId": encrypted_user_id
+    })
+    headers = {
+        'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
+        'Content-Type': 'application/json'
+    }
+
+    try:
+        conn.request("POST", "/account/personal", payload, headers)
+        res = conn.getresponse()
+        resdata = res.read().decode("utf-8")
+        res_json = json.loads(resdata)
+        user_info = UserInfo(res_json['companyName'], res_json['name'], res_json['userId'], res_json['username'])
+        return user_info
+    except Exception as e:
+        print("Error:", e)
+    finally:
+        conn.close()
+
+
+spark = SparkSQL(udf_files=['dw_base/spark/udf/contacts/ctc_common.py',
+                            'dw_base/spark/udf/spark_id_generate_udf.py'])
+
+
+def get_sign(secret):
+    timestamp = str(round(time.time() * 1000))
+    secret_enc = secret.encode('utf-8')
+    string_to_sign = '{}\n{}'.format(timestamp, secret)
+    string_to_sign_enc = string_to_sign.encode('utf-8')
+    hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest()
+    sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
+    return timestamp, sign
+
+
+def send_dingtalk_markdown(msg):
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "markdown",
+        "markdown": {"title": '企业库告警', "text": msg, }
+    }
+    json_data = json.dumps(data)
+    secret = 'SECffb7fe1b4c3aacc7be85d3b03de88fdbf93dfb48fe1c13ea7dba34a84847675e'
+    timestamp, sign = get_sign(secret)
+    url = f'https://oapi.dingtalk.com/robot/send?access_token=ffdb7df856220a925196e911107a4aa259acb2fd1160fee8b11d0c3c800974fc&timestamp={timestamp}&sign={sign}'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+def ent_user_top(dt):
+    sql = (f"select GET_JSON_OBJECT(ori_json, '$.params.userId') as  user ,count(1) as cnt from ent_raw.interface_base "
+           f"where dt='{dt}' and  topic = 'ent_tendata_interface' and GET_JSON_OBJECT(ori_json, '$.type') = 'BRIEF_RESULT' group by GET_JSON_OBJECT(ori_json, '$.params.userId') order by count(1) desc limit 10"
+           )
+    body = ''
+    for row in spark.query(sql)[0].collect():
+        userid = row.user
+        user_info = get_user_info(userid)
+        body += f'{user_info.username},{user_info.name},{user_info.company_name},**{row.cnt}**次 \n\n'
+    return body
+
+if __name__ == '__main__':
+    CONFIG, _ = parse_args(sys.argv[1:])
+    dts = CONFIG.get('dt').split(',')
+    for dt in dts:
+        format_dt = f'{dt[:4]}-{dt[4:6]}-{dt[6:]}'
+        ent_user_top_cnt = ent_user_top(dt)
+        msg = f'''### 企业主页接口调用统计top10
+> **统计日期 :  {format_dt}**
+
+{ent_user_top_cnt}
+                '''
+        print(msg)
+        send_dingtalk_markdown(msg)

+ 242 - 0
dw_base/scheduler/ent_interface_dingtalk_update.py

@@ -0,0 +1,242 @@
+import sys
+import re
+import os
+import requests
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.utils.config_utils import parse_args
+from dw_base.spark.spark_sql import SparkSQL
+import json
+
+spark = SparkSQL(udf_files=['dw_base/spark/udf/contacts/ctc_common.py',
+                            'dw_base/spark/udf/spark_id_generate_udf.py'],
+                 extra_spark_config={'spark.sql.crossJoin.enabled': True})
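+# spark.sql.crossJoin.enabled is set because get_auto_source_cnt below joins three
+# single-row subqueries without ON conditions, which Spark treats as cross joins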
+
+
+def send_dingtalk_notification(msg):
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "msgtype": "text",
+        "text": {"content": msg}
+    }
+    json_data = json.dumps(data)
+    # enterprise-db data product line group
+    url = 'https://oapi.dingtalk.com/robot/send?access_token=c4086d8ba377fdade2dff869e71063733095bc718d3bafdfbe8be0966aa050d6'
+    # enterprise-db management group
+    # url = 'https://oapi.dingtalk.com/robot/send?access_token=5183dfe1ecbe06261bcac7b45c1a6b5ae101fec67877d74120a6a95c88d1f917'
+    # url = 'https://oapi.dingtalk.com/robot/send?access_token=c4086d8ba377fdade2dff869e71063733095bc718d3bafdfbe8be0966aa050d6'
+    # enterprise & contacts bot test group
+    # url = 'https://oapi.dingtalk.com/robot/send?access_token=bee997dbf61e839a17de087830ffef6e864c3109fef62a956703bdfe043b0e10'
+    response = requests.post(url=url, data=json_data, headers=headers)
+    response.raise_for_status()
+
+
+def get_base_cnt(dt, trigger_type):
+    sql = f'''
+SELECT count(DISTINCT user_id) as user_cnt, count(distinct trace_id) as trace_cnt
+FROM (SELECT user_id, trace_id
+      FROM ctc_ods.ctc_shh_interface_log
+      WHERE dt = '{dt}'
+        AND trigger_type = '{trigger_type}'
+      UNION ALL
+      SELECT user_id, trace_id
+      FROM ctc_ods.ctc_snv_interface_log
+      WHERE dt = '{dt}'
+        AND trigger_type = '{trigger_type}'
+      UNION ALL
+      SELECT user_id, trace_id
+      FROM ctc_ods.ctc_google_interface_log
+      WHERE dt = '{dt}'
+        AND trigger_type = '{trigger_type}') t
+    '''
+    return spark.query(sql)[0].collect()[0]
+
+
+def get_web_cnt(dt, trigger_type):
+    sql = f'''
+select count(1)                                                                              as request_web_cnt,
+       nvl(sum(if(get_json_object(ori_json, '$.result.data.website') is not null, 1, 0)), 0) as get_web_cnt
+from ent_raw.interface_base
+where topic = 'ent_tendata_interface'
+  and dt = '{dt}'
+  and get_json_object(ori_json, '$.source') = 'BING'
+  and get_json_object(ori_json, '$.type') = '{trigger_type}'
+    '''
+    return spark.query(sql)[0].collect()[0]
+
+
+def get_auto_user_cnt(dt):
+    sql = f'''
+      SELECT 
+       count(DISTINCT get_json_object(ori_json, '$.params.userId')) AS request_user_cnt
+FROM ent_raw.interface_base
+WHERE topic = 'ent_tendata_interface'
+  AND dt = '{dt}'
+  AND get_json_object(ori_json, '$.source') = 'BING'
+  AND get_json_object(ori_json, '$.type') = 'AUTO'
+    '''
+    return spark.query(sql)[0].collect()[0]['request_user_cnt']
+
+
+def get_auto_source_cnt(dt):
+    sql = f'''
+    SELECT *
+from (select count(distinct trace_id) as shh_cnt
+      FROM ctc_ods.ctc_shh_interface_log
+      WHERE dt = '{dt}'
+        AND trigger_type = 'AUTO') shh
+         join (SELECT count(distinct trace_id) as snv_cnt
+               FROM ctc_ods.ctc_snv_interface_log
+               WHERE dt = '{dt}'
+                 AND trigger_type = 'AUTO') snv
+         join (SELECT count(distinct trace_id) as ggl_cnt
+               FROM ctc_ods.ctc_google_interface_log
+               WHERE dt = '{dt}'
+                 AND trigger_type = 'AUTO') ggl
+        '''
+    return spark.query(sql)[0].collect()[0]
+
+def get_res_cnt(dt, trigger_type):
+    sql = f'''
+with init as (select ti,
+                     if(source like '%shh_%', 1, 0)   as shh_flag,
+                     if(source like '%snovio%', 1, 0) as snv_flag,
+                     if(source like '%google%', 1, 0) as ggl_flag
+              from ctc_mid.ctc_main_pre
+                       LATERAL VIEW explode(trace_id) exploded_table1 AS ti
+              where dt = '{dt}'
+                and array_contains(trigger_type, '{trigger_type}')),
+     flag as (select ti
+                   , if(sum(shh_flag) > 0, 1, 0) as shh_get_flag
+                   , if(sum(snv_flag) > 0, 1, 0) as snv_get_flag
+                   , if(sum(ggl_flag) > 0, 1, 0) as ggl_get_flag
+              from init
+              group by ti)
+select nvl(sum(shh_get_flag),0) as shh_get_cnt
+     , nvl(sum(snv_get_flag),0) as snv_get_cnt
+     , nvl(sum(ggl_get_flag),0) as ggl_get_cnt
+     , count(ti)         as all_get_cnt
+from flag
+        '''
+    return spark.query(sql)[0].collect()[0]
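+
+# NOTE: in get_res_cnt, ctc_main_pre.trace_id is an array column, so it is exploded
+# and each trace is flagged once per source; the *_get_cnt columns count traces that
+# yielded contacts from that source, and all_get_cnt counts traces that yielded any.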
+
+
+def get_ctc_cnt(dt, trigger_type):
+    sql = f'''
+select nvl(sum(if(source like '%shh_%', 1, 0)), 0)   as shh_ctc_cnt,
+       nvl(sum(if(source like '%snovio%', 1, 0)), 0) as snv_ctc_cnt,
+       nvl(sum(if(source like '%google%', 1, 0)), 0) as ggl_ctc_cnt
+from ctc_mid.ctc_main_pre
+where dt = '{dt}'
+  and array_contains(trigger_type, '{trigger_type}')
+            '''
+    return spark.query(sql)[0].collect()[0]
+
+
+if __name__ == '__main__':
+    CONFIG, _ = parse_args(sys.argv[1:])
+    dts = CONFIG.get('dt').split(',')
+    for dt in dts:
+        format_dt = f'{dt[:4]}-{dt[4:6]}-{dt[6:]}'
+
+        manual_base_cnt = get_base_cnt(dt, 'MANUAL')
+        manual_web_cnt = get_web_cnt(dt, 'MANUAL')
+        manual_res_cnt = get_res_cnt(dt, 'MANUAL')
+        manual_ctc_cnt = get_ctc_cnt(dt, 'MANUAL')
+
+        manual_user_cnt = manual_base_cnt['user_cnt']
+        manual_trace_cnt = manual_base_cnt['trace_cnt']
+        manual_trace_avg = manual_trace_cnt / manual_user_cnt if manual_user_cnt > 0 else 0
+
+        manual_web_request_cnt = manual_web_cnt['request_web_cnt']
+        manual_web_get_cnt = manual_web_cnt['get_web_cnt']
+        manual_web_get_pct = 100 * manual_web_get_cnt / manual_web_request_cnt if manual_web_request_cnt > 0 else 0
+
+        manual_shh_get_cnt = manual_res_cnt['shh_get_cnt']
+        manual_shh_get_pct = 100 * manual_shh_get_cnt / manual_trace_cnt if manual_trace_cnt > 0 else 0
+        manual_snv_get_cnt = manual_res_cnt['snv_get_cnt']
+        manual_snv_get_pct = 100 * manual_snv_get_cnt / manual_trace_cnt if manual_trace_cnt > 0 else 0
+        manual_ggl_get_cnt = manual_res_cnt['ggl_get_cnt']
+        manual_ggl_get_pct = 100 * manual_ggl_get_cnt / manual_trace_cnt if manual_trace_cnt > 0 else 0
+        manual_all_get_cnt = manual_res_cnt['all_get_cnt']
+        manual_all_get_pct = 100 * manual_all_get_cnt / manual_trace_cnt if manual_trace_cnt > 0 else 0
+
+        manual_ctc_shh_cnt = manual_ctc_cnt['shh_ctc_cnt']
+        manual_ctc_snv_cnt = manual_ctc_cnt['snv_ctc_cnt']
+        manual_ctc_ggl_cnt = manual_ctc_cnt['ggl_ctc_cnt']
+        ############################################################
+        auto_base_cnt = get_base_cnt(dt, 'AUTO')
+        auto_web_cnt = get_web_cnt(dt, 'AUTO')
+        auto_res_cnt = get_res_cnt(dt, 'AUTO')
+        auto_ctc_cnt = get_ctc_cnt(dt, 'AUTO')
+
+        auto_user_cnt = auto_base_cnt['user_cnt']
+        auto_trace_cnt = auto_base_cnt['trace_cnt']
+        auto_trace_avg = auto_trace_cnt / auto_user_cnt if auto_user_cnt > 0 else 0
+
+        auto_web_request_cnt = auto_web_cnt['request_web_cnt']
+        auto_web_get_cnt = auto_web_cnt['get_web_cnt']
+        auto_web_get_pct = 100 * auto_web_get_cnt / auto_web_request_cnt if auto_web_request_cnt > 0 else 0
+
+
+        auto_source_cnt = get_auto_source_cnt(dt)
+        auto_request_shh_cnt = auto_source_cnt['shh_cnt']
+        auto_request_snv_cnt = auto_source_cnt['snv_cnt']
+        auto_request_ggl_cnt = auto_source_cnt['ggl_cnt']
+
+        auto_shh_get_cnt = auto_res_cnt['shh_get_cnt']
+        auto_shh_get_pct = 100 * auto_shh_get_cnt / auto_request_shh_cnt if auto_request_shh_cnt > 0 else 0
+        auto_snv_get_cnt = auto_res_cnt['snv_get_cnt']
+        auto_snv_get_pct = 100 * auto_snv_get_cnt / auto_request_snv_cnt if auto_request_snv_cnt > 0 else 0
+        auto_ggl_get_cnt = auto_res_cnt['ggl_get_cnt']
+        auto_ggl_get_pct = 100 * auto_ggl_get_cnt / auto_request_ggl_cnt if auto_request_ggl_cnt > 0 else 0
+        auto_all_get_cnt = auto_res_cnt['all_get_cnt']
+        auto_all_get_pct = 100 * auto_all_get_cnt / auto_trace_cnt if auto_trace_cnt > 0 else 0
+
+        auto_ctc_shh_cnt = auto_ctc_cnt['shh_ctc_cnt']
+        auto_ctc_snv_cnt = auto_ctc_cnt['snv_ctc_cnt']
+        auto_ctc_ggl_cnt = auto_ctc_cnt['ggl_ctc_cnt']
+
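+        # Replace the user count taken from auto_base_cnt with the value returned by
+        # get_auto_user_cnt before building the report below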
+        auto_user_cnt = get_auto_user_cnt(dt)
+
+        msg = f'''【手动/自动更新效果统计】------------------------------------------
+统计日期: {format_dt}
+1、手动更新
+①手动更新请求总人数:{manual_user_cnt}人
+②手动更新请求总次数:{manual_trace_cnt}次
+③人均请求次数:{manual_trace_avg:.2f}次
+④手动请求bing网址总次数:{manual_web_request_cnt}次
+⑤bing获取到网址的次数及占比:{manual_web_get_cnt}次,{manual_web_get_pct:.2f}%
+⑥单接口获取到联系人次数及占比:{manual_shh_get_cnt}次,{manual_shh_get_pct:.2f}%
+⑦单接口获取到联系人去重总数:{manual_ctc_shh_cnt}
+⑧snovio接口获取到联系人次数及占比:{manual_snv_get_cnt}次,{manual_snv_get_pct:.2f}%
+⑨snovio接口获取到联系人去重总数:{manual_ctc_snv_cnt}
+⑩google爬虫获取到联系人次数及占比:{manual_ggl_get_cnt}次,{manual_ggl_get_pct:.2f}%
+⑪google爬虫获取到联系人去重总数:{manual_ctc_ggl_cnt}
+⑫当日手动更新获得联系方式的总次数:{manual_all_get_cnt}  
+⑬当日手动更新解决联系人问题的百分比:{manual_all_get_pct:.2f}%
+
+2、自动更新
+① 自动更新请求总人数:{auto_user_cnt}人
+② 自动更新请求总次数:{auto_trace_cnt}次
+③ 人均请求次数:{auto_trace_avg:.2f}次
+④ 自动请求bing网址总次数:{auto_web_request_cnt}次
+⑤ bing获取到网址的次数及占比:{auto_web_get_cnt}次,{auto_web_get_pct:.2f}%
+⑥ 自动请求单接口的总次数:{auto_request_shh_cnt} 次
+⑦ 单接口获取到联系人次数及占比:{auto_shh_get_cnt}次,{auto_shh_get_pct:.2f}%
+⑧ 单接口获取到联系人去重总数:{auto_ctc_shh_cnt}
+⑨ 自动请求snovio接口的总次数:{auto_request_snv_cnt} 次
+⑩ snovio接口获取到联系人次数及占比:{auto_snv_get_cnt}次,{auto_snv_get_pct:.2f}%
+⑪ snovio接口获取到联系人去重总数:{auto_ctc_snv_cnt}
+⑫ 自动请求google爬虫的总次数:{auto_request_ggl_cnt} 次
+⑬ google爬虫获取到联系人次数及占比:{auto_ggl_get_cnt}次,{auto_ggl_get_pct:.2f}%
+⑭ google爬虫获取到联系人去重总数:{auto_ctc_ggl_cnt}
+⑮ 当日自动更新请求联系方式的总次数:{auto_trace_cnt}
+⑯ 当日自动更新获得联系方式的总次数:{auto_all_get_cnt}
+⑰ 当日自动更新解决联系人问题的百分比:{auto_all_get_pct:.2f}%
+--------------------------------------------------------------- '''
+        print(msg)
+        send_dingtalk_notification(msg)

+ 185 - 0
dw_base/scheduler/get_oldmongo_cjfs.py

@@ -0,0 +1,185 @@
+import argparse
+import sys
+import re
+import os
+from pyhive import hive
+import pandas as pd
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.utils.log_utils import pretty_print
+from configparser import ConfigParser
+from pymongo import MongoClient
+from dw_base import *
+from dw_base.scheduler.polling_scheduler import get_mongo_client
+
+
+
+# List of old-source Mongo database names to iterate over
+my_array = [
+    'america_stat',
+    'australia',
+    'brazil',
+    'brazil_stat',
+    'canada',
+    'canada_stat',
+    'china_stat',
+    'cis',
+    'dominica',
+    'england',
+    'ethiopia',
+    'eurasian_bol',
+    'european_union',
+    'fiji',
+    'guatemala',
+    'honduras',
+    'honduras_stat',
+    'hongkong_stat',
+    'indonesia_stat',
+    'japan',
+    'kyrghyzstan',
+    'new_zealand',
+    'nicaragua',
+    'peru_exp',
+    'philippines_stat',
+    'russia_rail',
+    'salvador',
+    'salvador_stat',
+    'south_africa_stat',
+    'south_korea',
+    'south_korea_stat',
+    'spain',
+    'taiwan',
+    'thailand',
+    'thailand_stat',
+    'turkey_stat',
+    'zimbabwe',
+    'taiwan_stat',
+    'tanzania',
+    'tanzania_tboe',
+    'bolivia_stat',
+    'spain_co',
+    'congo_kinshasa',
+    'south_korea_co',
+    'england_stat',
+    'angola_stat',
+    'guatemala_stat',
+    'brazil_air',
+    'egypt_co',
+    'uruguay_nboe',
+    'panama_exp',
+    'bahrain_stat',
+    'dominican_republic_stat',
+    'qatar_stat'
+]
+
+
+def parse_arguments():
+    # Create the ArgumentParser
+    parser = argparse.ArgumentParser(description='Process some parameters.')
+
+    # Add the -mgdb argument (source Mongo database name)
+    parser.add_argument('-mgdb', dest='mgdb', required=True, help='Parameter 1')
+
+    # Parse and return the arguments
+    return parser.parse_args()
+
+def get_mongo_client(conf_path):
+    config_parser = ConfigParser()
+    config_parser.read(root_path + conf_path)
+    url = config_parser.get('base', 'address')
+    return MongoClient(url)
+
+
+
+def get_count(client, mgdb):
+    # Select the database
+    db = client[mgdb]
+    # Select the import and export collections
+    collection1 = db['shipments_imports']
+    collection2 = db['shipments_exports']
+
+    # Group and count with an aggregation pipeline
+    pipeline = [
+        {
+            "$group": {
+                "_id": "$cjfs",  # group by the cjfs field
+                "count": {"$sum": 1}  # number of documents per group
+            }
+        }
+    ]
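+    # Each group document looks like {'_id': <cjfs value>, 'count': <number of matching shipments>}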
+
+    # Run the aggregation on both collections
+    results1 = list(collection1.aggregate(pipeline))
+    results2 = list(collection2.aggregate(pipeline))
+
+    pretty_print(f'开始合并结果-------------------------------------------------------------------------')
+
+    # Merge the two result sets
+    combined_results = list(results1) + list(results2)
+
+
+    # combined_results is a list of dicts; convert the result sets to DataFrames
+    df = pd.DataFrame(combined_results)
+    df1 = pd.DataFrame(results1)
+    df2 = pd.DataFrame(results2)
+
+    # Connect to Hive
+    hive_conn = hive.Connection(host='192.168.30.3', port=10000, username='hive', database='dim')
+
+    # Write into the Hive table
+    cursor = hive_conn.cursor()
+
+    pretty_print(f'开始插入结果-------------------------------------------------------------------------')
+
+    # Insert the import ('im') rows
+    for index, row in df1.iterrows():
+        insert_query = f"""
+        INSERT INTO dim.cts_cjfs_global_old (cjfs, cnt, gj, jck)
+        VALUES ('{row['_id']}' , '{row['count']}','{mgdb}', 'im')
+        """
+        pretty_print(f'{insert_query}')
+        cursor.execute(insert_query)
+    # Insert the export ('ex') rows
+    for index, row in df2.iterrows():
+        insert_query = f"""
+        INSERT INTO dim.cts_cjfs_global_old (cjfs, cnt, gj, jck)
+        VALUES ('{row['_id']}' , '{row['count']}','{mgdb}', 'ex')
+        """
+        cursor.execute(insert_query)
+
+    # Close the cursor and the connection
+    cursor.close()
+    hive_conn.close()
+
+    jgj = ('----------------------'+
+           '\n结果1-->' + str(results1) +
+           '结果1end\n结果2-->' + str(results2) +
+           '结果2end\n合并后结果-->'+str(combined_results)+
+           '\n----------------------'
+    )
+    pretty_print(f'{jgj}')
+    return jgj
+
+def get_old_count(client,mgdb):
+    result = get_count(client, mgdb)
+    pretty_print(f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb} '
+                 f'{NORM_MGT} old data count: {NORM_GRN}')
+    return result
+
+
+def main():
+    client = get_mongo_client('/../datasource/mongo/mongo-cts-prod-old.ini')
+    pretty_print(f'开始循环调用-------------------------------------------------------------------------')
+    pretty_print(f'{my_array}')
+    # Iterate over the database list and collect counts for each
+    for item in my_array:
+        pretty_print(f'开始执行:{item}')
+        get_old_count(client,item)
+    client.close()
+    return 0
+
+if __name__ == '__main__':
+    main()

+ 185 - 0
dw_base/scheduler/get_oldmongo_sldw.py

@@ -0,0 +1,185 @@
+import argparse
+import sys
+import re
+import os
+from pyhive import hive
+import pandas as pd
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.utils.log_utils import pretty_print
+from configparser import ConfigParser
+from pymongo import MongoClient
+from dw_base import *
+from dw_base.scheduler.polling_scheduler import get_mongo_client
+
+
+
+# List of old-source Mongo database names to iterate over
+my_array = [
+    'america_stat',
+    'australia',
+    'brazil',
+    'brazil_stat',
+    'canada',
+    'canada_stat',
+    'china_stat',
+    'cis',
+    'dominica',
+    'england',
+    'ethiopia',
+    'eurasian_bol',
+    'european_union',
+    'fiji',
+    'guatemala',
+    'honduras',
+    'honduras_stat',
+    'hongkong_stat',
+    'indonesia_stat',
+    'japan',
+    'kyrghyzstan',
+    'new_zealand',
+    'nicaragua',
+    'peru_exp',
+    'philippines_stat',
+    'russia_rail',
+    'salvador',
+    'salvador_stat',
+    'south_africa_stat',
+    'south_korea',
+    'south_korea_stat',
+    'spain',
+    'taiwan',
+    'thailand',
+    'thailand_stat',
+    'turkey_stat',
+    'zimbabwe',
+    'taiwan_stat',
+    'tanzania',
+    'tanzania_tboe',
+    'bolivia_stat',
+    'spain_co',
+    'congo_kinshasa',
+    'south_korea_co',
+    'england_stat',
+    'angola_stat',
+    'guatemala_stat',
+    'brazil_air',
+    'egypt_co',
+    'uruguay_nboe',
+    'panama_exp',
+    'bahrain_stat',
+    'dominican_republic_stat',
+    'qatar_stat'
+]
+
+
+def parse_arguments():
+    # Create the ArgumentParser
+    parser = argparse.ArgumentParser(description='Process some parameters.')
+
+    # Add the -mgdb argument (source Mongo database name)
+    parser.add_argument('-mgdb', dest='mgdb', required=True, help='Parameter 1')
+
+    # Parse and return the arguments
+    return parser.parse_args()
+
+def get_mongo_client(conf_path):
+    config_parser = ConfigParser()
+    config_parser.read(root_path + conf_path)
+    url = config_parser.get('base', 'address')
+    return MongoClient(url)
+
+
+
+def get_count(client, mgdb):
+    # Select the database
+    db = client[mgdb]
+    # Select the import and export collections
+    collection1 = db['shipments_imports']
+    collection2 = db['shipments_exports']
+
+    # Group and count with an aggregation pipeline
+    pipeline = [
+        {
+            "$group": {
+                "_id": "$sldw",  # group by the sldw field
+                "count": {"$sum": 1}  # number of documents per group
+            }
+        }
+    ]
+
+    # Run the aggregation on both collections
+    results1 = list(collection1.aggregate(pipeline))
+    results2 = list(collection2.aggregate(pipeline))
+
+    pretty_print(f'开始合并结果-------------------------------------------------------------------------')
+
+    # Merge the two result sets
+    combined_results = list(results1) + list(results2)
+
+
+    # combined_results is a list of dicts; convert the result sets to DataFrames
+    df = pd.DataFrame(combined_results)
+    df1 = pd.DataFrame(results1)
+    df2 = pd.DataFrame(results2)
+
+    # Connect to Hive
+    hive_conn = hive.Connection(host='192.168.30.3', port=10000, username='hive', database='dim')
+
+    # Write into the Hive table
+    cursor = hive_conn.cursor()
+
+    pretty_print(f'开始插入结果-------------------------------------------------------------------------')
+
+    # Insert the import ('im') rows
+    for index, row in df1.iterrows():
+        insert_query = f"""
+        INSERT INTO dim.cts_sldw_global_old (sldw, cnt, gj, jck)
+        VALUES ('{row['_id']}' , '{row['count']}','{mgdb}', 'im')
+        """
+        pretty_print(f'{insert_query}')
+        cursor.execute(insert_query)
+    # Insert the export ('ex') rows
+    for index, row in df2.iterrows():
+        insert_query = f"""
+        INSERT INTO dim.cts_sldw_global_old (sldw, cnt, gj, jck)
+        VALUES ('{row['_id']}' , '{row['count']}','{mgdb}', 'ex')
+        """
+        cursor.execute(insert_query)
+
+    # Close the cursor and the connection
+    cursor.close()
+    hive_conn.close()
+
+    jgj = ('----------------------'+
+           '\n结果1-->' + str(results1) +
+           '结果1end\n结果2-->' + str(results2) +
+           '结果2end\n合并后结果-->'+str(combined_results)+
+           '\n----------------------'
+    )
+    pretty_print(f'{jgj}')
+    return jgj
+
+def get_old_count(client,mgdb):
+    result = get_count(client, mgdb)
+    pretty_print(f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb} '
+                 f'{NORM_MGT} old data count: {NORM_GRN}')
+    return result
+
+
+def main():
+    client = get_mongo_client('/../datasource/mongo/mongo-cts-prod-old.ini')
+    pretty_print(f'开始循环调用-------------------------------------------------------------------------')
+    pretty_print(f'{my_array}')
+    # Iterate over the database list and collect counts for each
+    for item in my_array:
+        pretty_print(f'开始执行:{item}')
+        get_old_count(client,item)
+    client.close()
+    return 0
+
+if __name__ == '__main__':
+    main()

+ 90 - 0
dw_base/scheduler/get_oldmongo_sldw_detail.py

@@ -0,0 +1,90 @@
+import argparse
+import sys
+import re
+import os
+from pyhive import hive
+import pandas as pd
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.utils.log_utils import pretty_print
+from configparser import ConfigParser
+from pymongo import MongoClient
+from dw_base import *
+from dw_base.scheduler.polling_scheduler import get_mongo_client
+
+
+
+# List of Mongo database names to process (currently just 'japan')
+my_array = [
+    'japan'
+]
+
+
+def parse_arguments():
+    # Create the ArgumentParser
+    parser = argparse.ArgumentParser(description='Process some parameters.')
+
+    # Add the -mgdb argument (source Mongo database name)
+    parser.add_argument('-mgdb', dest='mgdb', required=True, help='Parameter 1')
+
+    # Parse and return the arguments
+    return parser.parse_args()
+
+def get_mongo_client(conf_path):
+    config_parser = ConfigParser()
+    config_parser.read(root_path + conf_path)
+    url = config_parser.get('base', 'address')
+    return MongoClient(url)
+
+
+
+def get_count(client, mgdb):
+    # Select the database
+    db = client[mgdb]
+    # Select the collection (imports are currently disabled)
+    # collection1 = db['shipments_imports']
+    collection2 = db['shipments_exports']
+
+    # Group and count with an aggregation pipeline
+    pipeline = [
+        {
+            "$group": {
+                "_id": "$sldw",  # group by the sldw field
+                "count": {"$sum": 1},  # number of documents per group
+                "maxid": {"$max": "$_id"}  # largest _id in each group
+            }
+        }
+    ]
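+    # Each group document looks like {'_id': <sldw value>, 'count': <n>, 'maxid': <largest _id in the group>}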
+
+    # Run the aggregation
+    # results1 = list(collection1.aggregate(pipeline))
+    results2 = list(collection2.aggregate(pipeline))
+
+    pretty_print(f'开始合并结果-------------------------------------------------------------------------')
+
+    # Collect the results
+    combined_results =  list(results2)
+    pretty_print(f'结果-------------------------------------------------------------------------{combined_results}')
+    return combined_results
+
+def get_old_count(client,mgdb):
+    result = get_count(client, mgdb)
+    pretty_print(f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb} '
+                 f'{NORM_MGT} old data count: {NORM_GRN}')
+    return result
+
+
+def main():
+    client = get_mongo_client('/../datasource/mongo/mongo-cts-prod-old.ini')
+    pretty_print(f'开始循环调用-------------------------------------------------------------------------')
+    pretty_print(f'{my_array}')
+    # Iterate over the database list and collect counts for each
+    for item in my_array:
+        pretty_print(f'开始执行:{item}')
+        get_old_count(client,item)
+    client.close()
+    return 0
+
+if __name__ == '__main__':
+    main()

+ 102 - 0
dw_base/scheduler/get_oldmongo_stat.py

@@ -0,0 +1,102 @@
+# For DingTalk monitoring of whether the T+1 job needs to be rerun
+import argparse
+import sys
+import re
+import os
+import requests
+import json
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.spark.spark_sql import SparkSQL
+from dw_base.utils.log_utils import pretty_print
+from configparser import ConfigParser
+from datetime import time
+from pymongo import MongoClient
+from dw_base import *
+from dw_base.scheduler.polling_scheduler import get_mongo_client
+from dw_base.utils.config_utils import parse_args
+from dw_base.scheduler.mg2es.conf_reader import ConfReader
+from dw_base.scheduler.mg2es.es_operator import ESOperator
+from elasticsearch.exceptions import NotFoundError
+
+
+# sql = "SELECT mgdb, mgtbl_name FROM tmp.tmp_zjh_1011"
+# spark = SparkSQL()
+# res = spark.query(sql)[0].collect()
+
+def parse_arguments():
+    # Create the ArgumentParser
+    parser = argparse.ArgumentParser(description='Process some parameters.')
+
+    # Add the -mgdb argument (source Mongo database name)
+    parser.add_argument('-mgdb', dest='mgdb', required=True, help='Parameter 1')
+
+    # Parse and return the arguments
+    return parser.parse_args()
+
+def get_mongo_client(conf_path):
+    config_parser = ConfigParser()
+    config_parser.read(root_path + conf_path)
+    url = config_parser.get('base', 'address')
+    return MongoClient(url)
+
+
+
+def get_count(client, mgdb):
+    # Select the database
+    db = client[mgdb]
+    # Select the import and export collections
+    collection1 = db['shipments_imports']
+    collection2 = db['shipments_exports']
+    # Field names holding the company name in each collection
+    fields_name1 = 'jksmc'
+    fields_name2 = 'cksmc'
+
+    # # Look up the field name
+    # field_name = fields_name.get(mgtbl)
+    # if field_name is None:
+    #     # If the collection name is unknown, raise a ValueError
+    #     raise ValueError(f"No field name found for mgtbl: {mgtbl}")
+    # Use distinct() to get the number of unique values for the field
+
+    data1 = collection1.distinct(fields_name1)
+    data2 = collection2.distinct(fields_name2)
+    stat1 = len(data1)
+    stat2 = len(data2)
+    # count_documents({}) replaces Collection.count(), which was removed in pymongo 4
+    cnt1 = collection1.count_documents({})
+    cnt2 = collection2.count_documents({})
+    combined_data = len(set(data1 + data2))
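+    # Distinct company count across imports (jksmc) and exports (cksmc) combined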
+    jgj = ('----------------------'+
+           '\n进口总条数-->'+str(cnt1)+
+           ',\n出口总条数-->'+str(cnt2)+
+           ',\n进口去重企业数-->'+str(stat1)+
+           ',\n出口去重企业数-->'+str(stat2)+
+           ',\n进出口去重企业数-->'+str(combined_data)+
+           '\n----------------------'
+    )
+    pretty_print(f'{jgj}')
+    return jgj
+
+def get_old_count(mgdb):
+    client = get_mongo_client('/../datasource/mongo/mongo-cts-prod-old.ini')
+    result = get_count(client, mgdb)
+    pretty_print(f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb} '
+                 f'{NORM_MGT} old data count: {NORM_GRN}')
+    return result
+
+
+def main():
+    # CONFIG, _ = parse_args(sys.argv[1:])
+    # for record in res:
+    # mgtbl = record['mgtbl_name']
+
+    # Parse command-line arguments
+    args = parse_arguments()
+    mgdb = args.mgdb
+    old_cnt = get_old_count(mgdb)
+    return 0
+
+if __name__ == '__main__':
+    main()

+ 139 - 0
dw_base/scheduler/get_oldmongo_ysfs.py

@@ -0,0 +1,139 @@
+# For DingTalk monitoring of whether the T+1 job needs to be rerun
+import argparse
+import sys
+import re
+import os
+import requests
+import json
+from pyhive import hive
+import pandas as pd
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.spark.spark_sql import SparkSQL
+from dw_base.utils.log_utils import pretty_print
+from configparser import ConfigParser
+from datetime import time
+from pymongo import MongoClient
+from dw_base import *
+from dw_base.scheduler.polling_scheduler import get_mongo_client
+from dw_base.utils.config_utils import parse_args
+from dw_base.scheduler.mg2es.conf_reader import ConfReader
+from dw_base.scheduler.mg2es.es_operator import ESOperator
+from elasticsearch.exceptions import NotFoundError
+
+
+# sql = "SELECT mgdb, mgtbl_name FROM tmp.tmp_zjh_1011"
+# spark = SparkSQL()
+# res = spark.query(sql)[0].collect()
+
+def parse_arguments():
+    # Create the ArgumentParser
+    parser = argparse.ArgumentParser(description='Process some parameters.')
+
+    # Add the -mgdb argument (source Mongo database name)
+    parser.add_argument('-mgdb', dest='mgdb', required=True, help='Parameter 1')
+
+    # Parse and return the arguments
+    return parser.parse_args()
+
+def get_mongo_client(conf_path):
+    config_parser = ConfigParser()
+    config_parser.read(root_path + conf_path)
+    url = config_parser.get('base', 'address')
+    return MongoClient(url)
+
+
+
+def get_count(client, mgdb):
+    # Select the database
+    db = client[mgdb]
+    # Select the import and export collections
+    collection1 = db['shipments_imports']
+    collection2 = db['shipments_exports']
+
+    # Group and count with an aggregation pipeline
+    pipeline = [
+        {
+            "$group": {
+                "_id": "$ysfs",  # group by the ysfs field
+                "count": {"$sum": 1}  # number of documents per group
+            }
+        }
+    ]
+
+    # Run the aggregation on both collections
+    results1 = list(collection1.aggregate(pipeline))
+    results2 = list(collection2.aggregate(pipeline))
+
+    pretty_print(f'开始合并结果-------------------------------------------------------------------------')
+
+    # Merge the two result sets
+    combined_results = list(results1) + list(results2)
+
+
+    # combined_results is a list of dicts; convert the result sets to DataFrames
+    df = pd.DataFrame(combined_results)
+    df1 = pd.DataFrame(results1)
+    df2 = pd.DataFrame(results2)
+
+    # Connect to Hive
+    hive_conn = hive.Connection(host='192.168.30.3', port=10000, username='hive', database='dim')
+
+    # Write into the Hive table
+    cursor = hive_conn.cursor()
+
+    pretty_print(f'开始插入结果-------------------------------------------------------------------------')
+
+    # Insert the import ('im') rows
+    for index, row in df1.iterrows():
+        insert_query = f"""
+        INSERT INTO dim.cts_ysfs_global_old (ysfs, cnt, gj, jck)
+        VALUES ('{row['_id']}' , '{row['count']}','{mgdb}', 'im')
+        """
+        pretty_print(f'{insert_query}')
+        cursor.execute(insert_query)
+    # Insert the export ('ex') rows
+    for index, row in df2.iterrows():
+        insert_query = f"""
+        INSERT INTO dim.cts_ysfs_global_old (ysfs, cnt, gj, jck)
+        VALUES ('{row['_id']}' , '{row['count']}','{mgdb}', 'ex')
+        """
+        cursor.execute(insert_query)
+
+    # Close the cursor and the connection
+    cursor.close()
+    hive_conn.close()
+
+    jgj = ('----------------------'+
+           '\n结果1-->' + str(results1) +
+           '结果1end\n结果2-->' + str(results2) +
+           '结果2end\n合并后结果-->'+str(combined_results)+
+           '\n----------------------'
+    )
+    pretty_print(f'{jgj}')
+    return jgj
+
+def get_old_count(mgdb):
+    client = get_mongo_client('/../datasource/mongo/mongo-cts-prod-old.ini')
+    result = get_count(client, mgdb)
+    pretty_print(f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb} '
+                 f'{NORM_MGT} old data count: {NORM_GRN}')
+    return result
+
+
+def main():
+    # CONFIG, _ = parse_args(sys.argv[1:])
+    # for record in res:
+    # mgtbl = record['mgtbl_name']
+
+    # Parse command-line arguments
+    args = parse_arguments()
+    mgdb = args.mgdb
+    old_cnt = get_old_count(mgdb)
+    return 0
+
+if __name__ == '__main__':
+    main()

+ 0 - 0
dw_base/scheduler/mg2es/__init__.py


+ 53 - 0
dw_base/scheduler/mg2es/conf_reader.py

@@ -0,0 +1,53 @@
+import sys
+import os
+import re
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+import json
+from configparser import ConfigParser
+
+import yaml
+
+from dw_base.scheduler.mg2es.path_util import PathUtil
+
+
+class ConfReader():
+
+    def get_yml_data(self, yml_file_path):
+        with open(yml_file_path, 'r') as file:
+            data = yaml.safe_load(file)
+        return data
+
+    def get_json_data(self, json_file_path):
+        with open(json_file_path) as f:
+            data = json.load(f)
+        return data
+
+    def get_es_conf(self):
+        path = PathUtil.get_es_conn_path()
+        print(path)
+        config_parser = ConfigParser()
+        config_parser.read(path)
+        host = config_parser.get('base', 'host')
+        port = int(config_parser.get('base', 'port'))
+        return host, port
+
+    def get_redis_conf(self):
+        path = PathUtil.get_redis_conn_path()
+        print(path)
+        config_parser = ConfigParser()
+        config_parser.read(path)
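+        # Expected .ini layout (an assumption based on the keys read below):
+        #   [base]
+        #   host = 127.0.0.1
+        #   port = 6379
+        #   db = 0
+        #   password =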
+        host = config_parser.get('base', 'host')
+        port = int(config_parser.get('base', 'port'))
+        db = int(config_parser.get('base', 'db'))
+        password = config_parser.get('base', 'password')
+        # Treat an empty password string as None
+        password = password if password != '' else None
+        return host, port,db, password
+
+if __name__ == '__main__':
+    cf = ConfReader()
+    print(cf.get_es_conf())
+    print(cf.get_redis_conf())

+ 37 - 0
dw_base/scheduler/mg2es/dict_redis2hive.py

@@ -0,0 +1,37 @@
+import sys
+import os
+import re
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from dw_base.scheduler.mg2es.conf_reader import ConfReader
+from dw_base.scheduler.mg2es.redis_operator import RedisOperator
+from dw_base.spark.spark_sql import SparkSQL
+from dw_base.utils.config_utils import parse_args
+
+if __name__ == '__main__':
+    CONFIG, _ = parse_args(sys.argv[1:])
+    dt = CONFIG.get('dt')
+    cf = ConfReader()
+    host, port, db, password = cf.get_redis_conf()
+    redis_client = RedisOperator(host, port, db, password)
+    spark = SparkSQL().get_spark_session()
+    state_dict = redis_client.get_hash_table_all('customs:state:dict')
+    country_dict = redis_client.get_hash_table_all('customs:country:dict')
+    state_df_dict = [{"field": k.decode(), "value": v.decode()} for k, v in state_dict.items()]
+    country_df_dict = [{"field": k.decode(), "value": v.decode()} for k, v in country_dict.items()]
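+    # redis-py returns hash keys and values as bytes, hence the decode() calls above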
+    state_df = spark.createDataFrame(state_df_dict)
+    country_df = spark.createDataFrame(country_df_dict)
+    # Register the DataFrames as temporary views
+    state_df.createOrReplaceTempView("redis_state_data")
+    country_df.createOrReplaceTempView("redis_country_data")
+    # Write the data into the Hive tables
+    spark.sql("set hive.exec.dynamic.partition=true")
+    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
+    spark.sql("set spark.yarn.queue=cts")
+    print("开始写入Hive表")
+    spark.sql(f"INSERT overwrite TABLE dim.redis_cts_state_dict SELECT *,'{dt}' FROM redis_state_data")
+    spark.sql(f"INSERT overwrite TABLE dim.redis_cts_country_dict SELECT *,'{dt}' FROM redis_country_data")
+    # 停止SparkSession
+    spark.stop()

+ 47 - 0
dw_base/scheduler/mg2es/es_index_backup.py

@@ -0,0 +1,47 @@
+import sys
+import os
+import re
+# Example arguments: -catalog=imports -database_name=venezuela_bol -year=2023
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+from time import sleep
+
+from dw_base.scheduler.mg2es.conf_reader import ConfReader
+from dw_base.scheduler.mg2es.es_operator import ESOperator
+from dw_base.utils.config_utils import parse_args
+
+
+if __name__ == '__main__':
+    CONFIG, _ = parse_args(sys.argv[1:])
+    catalog = CONFIG.get('catalog')
+    database_name = CONFIG.get('database_name')
+    env = CONFIG.get('env','test')
+    host='192.168.0.200'
+    port='9201'
+    if env == 'prod':
+        host = '192.168.11.100'
+        port = '9003'
+    year = CONFIG.get('year')
+    bak_suffix = CONFIG.get('bak_suffix','bak')
+    es_operator = ESOperator(host, port)
+    index_name = f'customs_{catalog}_{database_name}-{year}'
+    bak_index_name = f'{index_name}-{bak_suffix}'
+    es_operator.create_index(bak_index_name)
+    task_id = es_operator.reindex(index_name, bak_index_name)['task']
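+    # With wait_for_completion=False, reindex returns a task descriptor immediately;
+    # poll the task id below until Elasticsearch reports the task as completed.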
+    total_time = 0
+    while True:
+        sleep(60)
+        total_time += 60
+        task_info = es_operator.get_task_status(task_id)
+        if task_info['completed'] == True:
+            print('迁移完成--------------------------')
+            print(f'迁移耗时:{total_time}秒')
+            cnt = es_operator.get_index_document_count(bak_index_name)
+            print(f'迁移文档数:{cnt}')
+            break
+        else:
+            print('迁移中----------------------------')
+            print(task_info)
+    # es_operator.delete_index(index_name)

+ 214 - 0
dw_base/scheduler/mg2es/es_operator.py

@@ -0,0 +1,214 @@
+import sys
+import os
+import re
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+import json
+from elasticsearch import Elasticsearch
+from elasticsearch.exceptions import NotFoundError
+
+from dw_base import NORM_CYN, NORM_RED, NORM_BLU, NORM_MGT
+
+
+class ESOperator:
+
+    def __init__(self, host, port, timeout=30):
+
+        self.es = Elasticsearch([{'host': host, 'port': port}], timeout=timeout)
+
+    def get_all_indices(self):
+        try:
+            indices = self.es.indices.get_alias("*").keys()
+            return list(indices)
+        except Exception as e:
+            print("Error:", e)
+            return []
+
+    def get_cts_indices(self, catalog, database_name):
+        indices = self.get_all_indices()
+        return [index for index in indices if catalog in index and database_name in index]
+
+    def get_aliases_for_index(self, index_name):
+        aliases = self.es.indices.get_alias(index=index_name)
+        return list(aliases[index_name]['aliases'].keys())
+
+    def add_alias_to_index(self, index_name, alias_name):
+        self.es.indices.put_alias(index=index_name, name=alias_name)
+
+    def get_indices_by_alias(self, alias_name):
+        result = self.es.indices.get_alias(name=alias_name)
+        return list(result.keys())
+
+    def get_random_documents(self, index_name, size=10):
+        try:
+            # 构造随机排序查询
+            query_body = {
+                "size": size,
+                "query": {
+                    "function_score": {
+                        "query": {"match_all": {}},
+                        "random_score": {}
+                    }
+                }
+            }
+            result = self.es.search(index=index_name, body=query_body)
+            return result['hits']['hits']
+        except Exception as e:
+            print("Error:", e)
+            return []
+
+    def get_random_doc_with_id(self, index_name, size=10):
+        doc_list = self.get_random_documents(index_name, size)
+        return {d['_id']: d['_source'] for d in doc_list}
+
+    # Note: this runs asynchronously; use get_task_status with the returned task id to check the result
+    def reindex(self, source_index, target_index):
+        body = {
+            "source": {
+                "index": source_index
+            },
+            "dest": {
+                "index": target_index
+            }
+        }
+        response = self.es.reindex(body=body, wait_for_completion=False)
+        return response
+
+    def get_task_status(self, task_id):
+        try:
+            task_info = self.es.tasks.get(task_id)
+            return task_info
+        except NotFoundError:
+            return None
+
+    def get_data_from_ids(self, index_name, id_list):
+        doc_list = [self.es.get(index=index_name, id=id) for id in id_list]
+        return {d['_id']: d['_source'] for d in doc_list}
+
+    def delete_index(self, index_name):
+        try:
+            response = self.es.indices.delete(index=index_name, ignore=[400, 404])
+            if response['acknowledged']:
+                print(f"Index '{index_name}' deleted successfully.")
+            else:
+                print(f"Failed to delete index '{index_name}'.")
+        except Exception as e:
+            print("Error:", e)
+
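+    # dict_diff compares two documents field by field: it reports keys present in only
+    # one of them, then any shared keys whose value or type differs (old -> new).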
+    def dict_diff(self, old_dict, new_dict):
+        old_keys = set(old_dict.keys())
+        new_keys = set(new_dict.keys())
+        old_only_keys = old_keys - new_keys
+        new_only_keys = new_keys - old_keys
+        common_keys = old_keys & new_keys
+        if old_only_keys:
+            print(f"{NORM_CYN} old_only_keys:")
+            for key in old_only_keys:
+                print(f"{NORM_BLU}      {key} :{old_dict[key]}")
+        if new_only_keys:
+            print(f"{NORM_CYN} new_only_keys:")
+            for key in new_only_keys:
+                print(f"{NORM_BLU}      {key} :{new_dict[key]}")
+        diff_data = {}
+        for key in common_keys:
+            if old_dict[key] != new_dict[key] or type(old_dict[key]) != type(new_dict[key]):
+                diff_data[key] = (old_dict[key], new_dict[key])
+        if diff_data:
+            print(f"{NORM_CYN} diff_data:")
+            for key in diff_data:
+                print(f"{NORM_BLU}      {key}: ")
+                print(
+                    f"{NORM_RED}          value:{NORM_MGT} {diff_data[key][0]}{NORM_RED} -> {NORM_MGT}{diff_data[key][1]}")
+                print(
+                    f"{NORM_RED}          type:{NORM_MGT} {type(old_dict[key])}{NORM_RED} -> {NORM_MGT}{type(new_dict[key])}")
+
+    def get_data_from_id(self, index_name, id):
+        response = self.es.get(index=index_name, id=id)
+        return response['_source']
+
+    def create_index_from_json(self, index_name, settings_and_mappings):
+        try:
+            self.es.indices.create(index=index_name, body=settings_and_mappings)
+            print(f"Index '{index_name}' created successfully.")
+        except Exception as e:
+            print(f"Error creating index '{index_name}':", e)
+
+    def create_index(self, index_name):
+        try:
+            if self.es.indices.exists(index=index_name):
+                print(f"Index '{index_name}' already exists.")
+                return False
+            self.es.indices.create(index=index_name)
+            print(f"Index '{index_name}' created successfully.")
+            return True
+        except Exception as e:
+            print(f"Error creating index '{index_name}': {e}")
+            return False
+
+    def get_index_document_count(self, index_name):
+        try:
+            result = self.es.count(index=index_name)
+            return result['count']
+        except Exception as e:
+            print("Error:", e)
+            return None
+
+    def random_diff(self, new_index, old_index):
+        new_dicts = self.get_random_doc_with_id(new_index)
+        id_list = [id for id in new_dicts.keys()]
+        old_dicts = self.get_data_from_ids(old_index, id_list)
+        for id in id_list:
+            print(f'【id:{id}】------------------------------------------------------')
+            self.dict_diff(old_dicts[id], new_dicts[id])
+
+    def refresh(self, index):
+        if not index:
+            raise ValueError("Index name must be specified.")
+
+        try:
+            self.es.indices.refresh(index=index)
+            print(f"Index {index} refreshed.")
+        except Exception as e:
+            print("Error during refresh:", e)
+
+    # This is a heavy operation; avoid it in production (it can degrade online performance)
+    def flush(self, index):
+        if not index:
+            raise ValueError("Index name must be specified.")
+
+        try:
+            self.es.indices.flush(index=index)
+            print(f"Index {index} flushed.")
+        except Exception as e:
+            print("Error during flush:", e)
+
+
+if __name__ == '__main__':
+    # es_operator = ESOperator('192.168.0.200', 9201)
+    es_operator = ESOperator('192.168.11.99', 9005)
+    es_operator.get_data_from_id('corp','b7730f7f75f47296e9261eb5934b140a')
+    # es_operator.refresh('customs_imports_venezuela-2020test')
+    # es_operator.refresh('customs_exports_pakistan-2020test')
+    es_operator.random_diff('customs_exports_mexico-2020test', 'customs_exports_mexico-2020')
+    # print(es_operator.get_cts_indices('exports', 'kazakhstan'))
+    # es_operator.add_alias_to_index('customs_exports_kazakhstan-2023-ctytest', 'cts_kazakhstan_ex-2023-ctytest')
+    # print(es_operator.get_aliases_for_index('customs_exports_kazakhstan-2023-ctytest'))
+    # print(es_operator.get_indices_by_alias('cts_kazakhstan_ex-2023-ctytest'))
+    # es_operator.reindex('customs_exports_kazakhstan-2023','customs_exports_kazakhstan-2023-bak')
+    # old_dict = es_operator.get_random_doc_with_id('customs_exports_kazakhstan-2023-bak')
+    # id_list = [id for id in old_dict.keys()]
+    # new_dict = es_operator.get_data_from_ids('customs_exports_kazakhstan-2023-ctytest', id_list)
+    # for id in id_list:
+    #     print(f'【id:{id}】------------------------------------------------------')
+    #     print(old_dict[id])
+    #     print(new_dict[id])
+    #     es_operator.dict_diff( old_dict[id],new_dict[id])
+    # old = es_operator.get_data_from_id('customs_exports_kazakhstan-2023-benchmark', '656d8f637e0d39686b8206e2')
+    # new = es_operator.get_data_from_id('customs_exports_kazakhstan-2023-bak', '656d8f637e0d39686b8206e2')
+    # print(old['exporterOrig'])
+    # print(new['exporterOrig'])
+    # rp = es_operator.reindex('customs_exports_kazakhstan-2023-ctytest','customs_exports_kazakhstan-2023-bak')
+    # print(rp)
+    # es_operator.delete_index('customs_exports_kazakhstan-2023-bak')

+ 250 - 0
dw_base/scheduler/mg2es/es_tmpl_gen.py

@@ -0,0 +1,250 @@
+import sys
+import os
+import re
+
+abspath = os.path.abspath(__file__)
+root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
+sys.path.append(root_path)
+import json
+
+from dw_base.scheduler.mg2es.path_util import PathUtil
+from dw_base.scheduler.mg2es.conf_reader import ConfReader
+
+
+class EsTmplGen:
+    def __init__(self, catalog, database_name):
+        self.catalog = catalog
+        self.database_name = database_name
+        es_json_path, mg2es_mapping_path = PathUtil.get_conf_abspath(catalog, database_name)
+        conf_reader = ConfReader()
+        self.yml_dict = conf_reader.get_yml_data(mg2es_mapping_path)
+        self.es_json = conf_reader.get_json_data(es_json_path)
+        self.type_dict = {
+            'date': 'string',
+            'text': 'string',
+            'keyword': 'string',
+            'scaled_float': 'double',
+        }
+
+        self.catalog_dict = {
+            'exports': 'ex',
+            'imports': 'im'}
+
+    def get_clos_with_type(self):
+        yml_fields = self.yml_dict['transformer']['mapping']['fields']
+        # Drop the productTag field; iterate over a copy so the in-place removal is safe
+        for field in list(yml_fields):
+            if field.get('name') == 'productTag':
+                yml_fields.remove(field)
+        yml_clos = [c['name'] for c in yml_fields]
+        # Replace _id with id
+        yml_clos[0] = 'id'
+        # Array-typed fields: handler is text_split_handler, or source is a list
+        arr_clos = [c['name'] for c in yml_fields if
+                    c.get('handler') == 'text_split_handler' or isinstance(c.get('source'), list)]
+        # print(f'全字段:{yml_clos}')
+        # print(f'数组类型字段:{arr_clos}')
+        # Build a mapping of ES field name -> declared ES type
+        json_list = {}
+        # Walk the mappings.properties entries of the ES index json
+        for key, value in self.es_json['mappings']['properties'].items():
+            # Keep just the field name and its type
+            json_list[key] = value['type']
+        # print(f'es类型:{json_list}')
+        res_list = []
+        for c in yml_clos:
+            if c in arr_clos:
+                res_list.append((c, 'array<string>'))
+            elif c in json_list:
+                res_list.append((c, self.type_dict[json_list[c]]))
+            else:
+                res_list.append((c, 'string'))
+        return res_list
+
+    def make_ddl_body(self):
+        clos_with_type = self.get_clos_with_type()
+        clos_len = [len(c[0]) for c in clos_with_type]
+        max_len = max(clos_len) + 2
+        formatted_clos = ['\t{:<{width}} {}'.format(f'`{c[0]}`', c[1], width=max_len) for c in clos_with_type]
+        clos_str = ",\n".join(formatted_clos)
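+        # e.g. each entry renders as "`importerName`    string" (backticked name padded to the longest name + 2)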
+        return clos_str
+
+    def make_2es_ddl(self):
+        clos_str = self.make_ddl_body()
+        ddl = (f'create table to_es.cts_{self.database_name}_{self.catalog_dict[self.catalog]}\n'
+               f'(\n'
+               f'{clos_str}'
+               f'\n) PARTITIONED BY ( `dt` string,year_from_date string) \n'
+               f'\tSTORED AS ORC'
+               )
+        return ddl
+
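+    # make_es_mapping_ddl builds the DDL for an external Hive table backed by the ES
+    # index via EsStorageHandler; it reuses make_ddl_body(), so its columns line up
+    # with the to_es staging table produced by make_2es_ddl().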
+    def make_es_mapping_ddl(self):
+        clos_str = self.make_ddl_body()
+        clos = [f'{ct[0]}:{ct[0]}' for ct in self.get_clos_with_type()]
+        clos = clos[1:]
+        mapping_prop = ','.join(clos)
+        ddl = (
+            f'create external table if not exists to_es.es_cts_{self.database_name}_{self.catalog_dict[self.catalog]}_yearNeedReplace\n'
+            f'(\n'
+            f'{clos_str}'
+            f"\n) STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler' \n"
+            f'\tTBLPROPERTIES ('
+            f'''\n'es.nodes' = '192.168.11.100',
+        'es.port' = '9003',
+        'es.http.timeout' = '100m',
+        'es.input.use.sliced.partitions' = 'false',
+        'es.input.json' = 'false',
+        'es.index.auto.create' = 'true',
+        'es.write.operation' = 'upsert',
+        'es.mapping.date.rich' = 'false',
+        'es.batch.write.refresh' = 'false',
+        'es.batch.size.bytes' = '60mb',
+        'es.batch.size.entries' = '5000',
+        'es.batch.write.retry.count' = '10',
+        'es.batch.write.retry.wait' = '60s',
+        'es.update.retry.on.conflict' = '5' ,
+        'es.resource' = 'customs_{self.catalog}_{self.database_name}-yearNeedReplace/_doc',
+        'es.mapping.id' = 'id',
+        'es.mapping.names' =
+            '{mapping_prop}')'''
+        )
+        return ddl
+
+    def make_2es_dml(self):
+        ct_list = self.get_clos_with_type()[2:]
+        yml_fields = self.yml_dict['transformer']['mapping']['fields'][2:]
+        field_list = []
+        filed_dict = {}
+        handler_list = []
+        for i in range(len(ct_list)):
+            yml_field = yml_fields[i]
+            field_tuple = self.get_field(yml_field)
+            field = field_tuple[0]
+            if ct_list[i][1] == 'string':
+                field = f"merge_ws({field})"
+            field_sql = f"{field} as `{field_tuple[1]}`"
+            field_list.append(field_sql)
+            filed_dict[field_tuple[1]] = field_tuple[0]
+            if 'handler' in yml_field:
+                if 'dict_handler' in yml_field['handler']:
+                    handler_list.append(yml_field)
+        dim_join_sql = self.get_dim_join_sql(handler_list, filed_dict)
+        dml_body = '\n\t, '.join(field_list)
+        dml = (f'insert overwrite table to_es.cts_{self.database_name}_{self.catalog_dict[self.catalog]}'
+               f'\nselect i.`id`'
+               f"\n\t, concat(replace(from_unixtime((i.`date` / 1000) - 8 * 60 * 60, 'yyyy-MM-dd HH:mm:ss'),' ','T'),'Z') as `date`"
+               f'\n\t, {dml_body}'
+               f'\n\t, i.`dt`'
+               f"\n\t, from_unixtime((i.`date` / 1000) - 8 * 60 * 60, 'yyyy')                         as `year_from_date`"
+               f'\nfrom to_mongo.cts_{self.database_name}_{self.catalog_dict[self.catalog]} i'
+               f'\n{dim_join_sql}'
+               f'\nwhere i.dt = "dtNeedReplace"'
+               )
+        return dml
+
+    def make_es_mapping_dml(self):
+        clos_with_type = self.get_clos_with_type()
+        clos = [f'i.`{c[0]}`' for c in clos_with_type]
+        dml_body = '\n     , '.join(clos)
+        dml = (
+            f'insert overwrite table to_es.es_cts_{self.database_name}_{self.catalog_dict[self.catalog]}_yearNeedReplace'
+            f'\nselect {dml_body}'
+            f'\nfrom to_es.cts_{self.database_name}_{self.catalog_dict[self.catalog]} i'
+            '\nwhere dt = "dtNeedReplace" and year_from_date = "yearNeedReplace"')
+        return dml
+
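+    # get_field maps a single yml field spec to (sql_expression, es_field_name),
+    # e.g. a hypothetical spec {name: importerName, source: jksmc} -> ("i.`jksmc`", "importerName"),
+    # while a list source becomes a de-duplicated, null-filtered array of the merged columns.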
+    def get_field(self, fields):
+        name = fields["name"]
+        field = ''
+        if 'default' in fields and 'source' not in fields:
+            field = f"'{fields['default']}'"
+        if 'handler' in fields:
+            if 'dict_handler' in fields['handler']:
+                field = f'{fields["name"]}_dim.`value`'
+            elif fields['handler'] == 'text_split_handler':
+                d = fields['delimiter']
+                if len(d) == 1:
+                    delimiter = d[0]
+                    field = f"array_distinct(split(i.`{fields['source']}`,'{delimiter}'))"
+                else:
+                    delimiter = fields['delimiter'][1]
+                    delimiters = fields['delimiter'][2:-1]
+                    field = f"array_distinct(split(regexp_replace(i.`{fields['source']}`, '[{delimiters}]', '{delimiter}'),'{delimiter}'))"
+        if 'source' in fields and 'handler' not in fields:
+            source = fields['source']
+            if isinstance(source, list):
+                # source = [f"`i.{s}`" for s in source]
+                source = [f"merge_ws(i.`{s}`)" for s in source]
+                # field = f"coalesce({','.join(source)})"
+                field = f"filter(array_distinct(array({','.join(source)})),x -> x is not null)"
+            else:
+                field = f"i.`{source}`"
+        if 'default' in fields and 'source' in fields:
+            field = f"coalesce({field}, '{fields['default']}')"
+        return (field, name)
+
+    def get_dim_join_sql(self, handler_list, filed_dict):
+        sql_list = []
+        for field in handler_list:
+            handler = field['handler']
+            dim = f'{field["name"]}_dim'
+            source = field['source']
+            if '__' in source:
+                source = filed_dict.get(source.split('__')[1])
+            else:
+                source = f'i.`{source}`'
+            if handler == 'country_dict_handler':
+                sql_list.append(
+                    f'left join dim.redis_cts_country_dict as {dim} on {dim}.dt = "dtNeedReplace" and lower({source}) = {dim}.`field`')
+            elif handler == 'state_dict_handler':
+                sql_list.append(
+                    f'left join dim.redis_cts_state_dict as {dim} on {dim}.dt = "dtNeedReplace" and lower({source}) = {dim}.`field`')
+        return '\n'.join(sql_list)
+
+    def replace_sql(self, es_bak_ddl, es_mapping_ddl, es_bak_dml, data_source):
+        if data_source == 'india_im':
+            es_bak_ddl = es_bak_ddl.replace("`importerAddress`          string,",
+                                            "`importerAddress`          array<string>,")
+            es_bak_ddl = es_bak_ddl.replace("`exporterAddress`          string,",
+                                            "`exporterAddress`          array<string>,")
+            es_mapping_ddl = es_mapping_ddl.replace("`importerAddress`          string,",
+                                                    "`importerAddress`          array<string>,")
+            es_mapping_ddl = es_mapping_ddl.replace("`exporterAddress`          string,",
+                                                    "`exporterAddress`          array<string>,")
+            es_bak_dml = es_bak_dml.replace("merge_ws(i.`jksdz`) as `importerAddress`",
+                                            "str_to_arr(i.`jksdz`) as `importerAddress`")
+            es_bak_dml = es_bak_dml.replace("merge_ws(i.`cksdz`) as `exporterAddress`",
+                                            "str_to_arr(i.`cksdz`) as `exporterAddress`")
+        if data_source == 'america_im':
+            es_bak_ddl = es_bak_ddl.replace("`importerAddress`          string,",
+                                            "`importerAddress`          array<string>,")
+            es_bak_ddl = es_bak_ddl.replace("`exporterAddress`          string,",
+                                            "`exporterAddress`          array<string>,")
+            es_bak_ddl = es_bak_ddl.replace("`notifyPartyAddress`       string,",
+                                            "`notifyPartyAddress`       array<string>,")
+            es_mapping_ddl = es_mapping_ddl.replace("`importerAddress`          string,",
+                                                    "`importerAddress`          array<string>,")
+            es_mapping_ddl = es_mapping_ddl.replace("`exporterAddress`          string,",
+                                                    "`exporterAddress`          array<string>,")
+            es_mapping_ddl = es_mapping_ddl.replace("`notifyPartyAddress`       string,",
+                                                    "`notifyPartyAddress`       array<string>,")
+            es_bak_dml = es_bak_dml.replace("merge_ws(i.`shrdz`) as `importerAddress`",
+                                            "str_to_arr(i.`shrdz`) as `importerAddress`")
+            es_bak_dml = es_bak_dml.replace("merge_ws(i.`fhrdz`) as `exporterAddress`",
+                                            "str_to_arr(i.`fhrdz`) as `exporterAddress`")
+            es_bak_dml = es_bak_dml.replace("merge_ws(i.`tzrdz`) as `notifyPartyAddress`",
+                                            "str_to_arr(i.`tzrdz`) as `notifyPartyAddress`")
+        return es_bak_ddl, es_mapping_ddl, es_bak_dml
+
+
+if __name__ == '__main__':
+    # es = EsDDLGen('exports', 'america')
+    es = EsTmplGen('imports', 'america')
+    print('\n\n--2es_ddl-------------------------------------------------------')
+    print(es.make_2es_ddl())
+    print('\n\n--es_mapping_ddl-------------------------------------------------------')
+    print(es.make_es_mapping_ddl())
+    print('\n\n--2es_dml-------------------------------------------------------')
+    print(es.make_2es_dml())
+    print('\n\n--es_mapping_dml-------------------------------------------------------')
+    print(es.make_es_mapping_dml())

Some files were not shown because too many files were changed