1 ヶ月前 · 40fc29456c
--- a/bin/datax-sync-template-gen.py
+++ b/bin/datax-sync-template-gen.py
@@ -28,6 +28,7 @@ import argparse
 
															 import os
														
 
															 import re
														
 
															 import sys
														
 
															+from configparser import ConfigParser
														
 
															 from datetime import datetime
														
 
															 project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
														
@@ -93,28 +94,65 @@ def query_columns_full(conn, schema, table):
 
															     return cur.fetchall()
														
 
															-def render_schema_md(rows):
														
 
															-    """输出 markdown 表格：序号 / 字段名 / 中文名 / 数据类型 / 主键标识 / 裁剪类型（空，开发者填）"""
														
 
															+def load_mask_conf(path):
														
 
															+    """读 mask 配置 ini，返回 {field: method} dict。
														
 
															+
														
 
															+    格式（与 jobs/raw/{域}/{table}.mask.ini 同款）：
														
 
															+        [mask]
														
 
															+        field1 = method1
														
 
															+        field2 = method2
														
 
															+
														
 
															+    method ∈ trim / md5 / month_trunc / mask_middle / keep_first_n / keep_last_n
														
 
															+    - trim：整字段不入 raw（reader column 不查询）
														
 
															+    - 其他：字段入 raw，由 dw_base.datax.mask 在 reader 端脱敏
														
 
															+    """
														
 
															+    cp = ConfigParser()
														
 
															+    cp.read(path, encoding='utf-8')
														
 
															+    if not cp.has_section('mask'):
														
 
															+        return {}
														
 
															+    return dict(cp.items('mask'))
														
 
															+
														
 
															+
														
 
															+def render_schema_md(rows, mask_dict=None):
														
 
															+    """输出 markdown 表格：序号 / 字段名 / 中文名 / 数据类型 / 主键标识 / 脱敏类型。
														
 
															+
														
 
															+    mask_dict 不传时脱敏类型列为空白；传入时填字段对应的 method（含 trim）。
														
 
															+    """
														
 
															     lines = [
														
 
															-        '| 序号 | 字段名 | 中文名 | 数据类型 | 主键标识 | 裁剪类型 |',
														
 
															+        '| 序号 | 字段名 | 中文名 | 数据类型 | 主键标识 | 脱敏类型 |',
														
 
															         '| --- | --- | --- | --- | --- | --- |',
														
 
															     ]
														
 
															+    methods = mask_dict or {}
														
 
															     for num, name, comment, typ, pk in rows:
														
 
															-        lines.append('| {} | `{}` | {} | {} | {} |  |'.format(
														
 
															-            num, name, comment or '', typ, pk))
														
 
															+        method = methods.get(name, '')
														
 
															+        lines.append('| {} | `{}` | {} | {} | {} | {} |'.format(
														
 
															+            num, name, comment or '', typ, pk, method))
														
 
															     return '\n'.join(lines) + '\n'
														
 
															-def render_template(ds_ref, database, schema, table, columns, pk):
														
 
															+def render_template(ds_ref, database, schema, table, columns, pk, mask_methods=None):
														
 
															+    """渲染 sync ini 模板。
														
 
															+
														
 
															+    columns: [(name, comment), ...] 已剔除 trim 字段，保持 PG 原顺序
														
 
															+    mask_methods: {field: method} 仅含非 trim 方法（mask_middle / month_trunc 等），
														
 
															+                  渲染 [mask] 段；空 dict 或 None 时不渲染 [mask] 段
														
 
															+    """
														
 
															     column_str = ','.join(c for c, _ in columns)
														
 
															     today = datetime.now().strftime('%Y-%m-%d')
														
 
															+
														
 
															+    if mask_methods:
														
 
															+        mask_lines = '\n'.join('{} = {}'.format(f, m) for f, m in mask_methods.items())
														
 
															+        mask_section = '[mask]\n' + mask_lines + '\n\n'
														
 
															+    else:
														
 
															+        mask_section = ''
														
 
															+
														
 
															     return (
														
 
															         '; 作者：<TODO>\n'
														
 
															         '; 日期：{today}\n'
														
 
															         '; 工单：<TODO>\n'
														
 
															         '; 目的：PG {database}.{schema}.{table} → Hive raw.<TODO> 同步模板\n'
														
 
															         '; 状态：[待执行]\n'
														
 
															-        '; 备注：自动生成的全字段参考模板。开发者按需裁剪字段 / 改 where / 加 [mask] /\n'
														
 
															+        '; 备注：自动生成的全字段参考模板。开发者按需裁剪字段 / 改 where / 加 mask 段 /\n'
														
 
															         ';       调 splitPk / 改 writer.path 表名后缀（_inc_d / _his_o 等）\n'
														
 
															         ';\n'
														
 
															         '; 配套 DDL：manual/ddl/raw/<TODO_domain>/raw_<TODO>_create.sql\n'
														
@@ -130,6 +168,7 @@ def render_template(ds_ref, database, schema, table, columns, pk):
 
															         'splitPk = {pk}\n'
														
 
															         'fetchSize = 1000\n'
														
 
															         '\n'
														
 
															+        '{mask_section}'
														
 
															         '[writer]\n'
														
 
															         'dataSource = hdfs/<TODO>\n'
														
 
															         'path = /user/hive/warehouse/raw.db/{table}_TODO_d/dt=${{dt}}/\n'
														
@@ -142,7 +181,7 @@ def render_template(ds_ref, database, schema, table, columns, pk):
 
															         'fieldDelimiter = \\t\n'
														
 
															     ).format(
														
 
															         today=today, ds_ref=ds_ref, database=database, schema=schema,
														
 
															-        table=table, column_str=column_str, pk=pk,
														
 
															+        table=table, column_str=column_str, pk=pk, mask_section=mask_section,
														
 
															     )
														
@@ -157,6 +196,8 @@ def main():
 
															                         help='schema 限定的表名（如 public.card_group_order_info）')
														
 
															     parser.add_argument('-o', nargs='?', const=WORKSPACE_DEFAULT, default=None, metavar='DIR',
														
 
															                         help='输出目录（不传 stdout 同时打印 md 表 + ini 模板；不带值 workspace/{yyyymmdd}/ 写两文件；带值自定义目录写两文件）')
														
 
															+    parser.add_argument('--mask-conf', default=None, metavar='PATH',
														
 
															+                        help='mask 配置 ini 路径（{table}.mask.ini）。传入时按配置剔除 trim 字段 + 渲染 [mask] 段，md 脱敏类型列填好；不传时全字段输出，md 脱敏类型列空白')
														
 
															     args = parser.parse_args()
														
 
															     if '.' not in args.t:
														
@@ -184,12 +225,18 @@ def main():
 
															         conn.close()
														
 
															     # full_rows: [(attnum, attname, comment, pg_type, pk_flag), ...]
														
 
															-    columns = [(r[1], r[2] or '') for r in full_rows]
														
 
															+    mask_dict = load_mask_conf(args.mask_conf) if args.mask_conf else {}
														
 
															+    trim_set = {f for f, m in mask_dict.items() if m == 'trim'}
														
 
															+    non_trim_mask = {f: m for f, m in mask_dict.items() if m != 'trim'}
														
 
															+
														
 
															+    # 已剔除 trim 字段的 column 列表，保持 PG 原顺序（attnum 升序）
														
 
															+    columns = [(r[1], r[2] or '') for r in full_rows if r[1] not in trim_set]
														
 
															+
														
 
															     pk_names = [r[1] for r in full_rows if r[4] == 'PK']
														
 
															-    pk = pk_names[0] if len(pk_names) == 1 else ''  # 复合主键 / 无主键 → 空
														
 
															+    pk = pk_names[0] if len(pk_names) == 1 and pk_names[0] not in trim_set else ''
														
 
															-    md_content = render_schema_md(full_rows)
														
 
															-    ini_content = render_template(args.ds, database, schema, table, columns, pk)
														
 
															+    md_content = render_schema_md(full_rows, mask_dict)
														
 
															+    ini_content = render_template(args.ds, database, schema, table, columns, pk, non_trim_mask)
														
 
															     if args.o is None:
														
 
															         # stdout：先 md 表后 ini 模板
														
--- a/tests/unit/datax/test_sync_template_gen.py
+++ b/tests/unit/datax/test_sync_template_gen.py
@@ -64,6 +64,25 @@ def test_render_template_includes_required_fields():
 
															     assert "where = update_time >= '${start_date}' AND update_time < '${stop_date}'" in out
														
 
															     assert 'path = /user/hive/warehouse/raw.db/users_TODO_d/dt=${dt}/' in out
														
 
															     assert 'fileName = users_TODO_d' in out
														
 
															+    # 不传 mask_methods 时不渲染 [mask] section header
														
 
															+    assert '\n[mask]\n' not in out
														
 
															+
														
 
															+
														
 
															+def test_render_template_with_mask_methods():
														
 
															+    columns = [('id', 'id'), ('user_name', '用户名'), ('phone', '手机号')]
														
 
															+    out = GEN.render_template(
														
 
															+        ds_ref='postgresql/prod-hobby', database='db', schema='public',
														
 
															+        table='users', columns=columns, pk='id',
														
 
															+        mask_methods={'user_name': 'mask_middle', 'phone': 'md5'},
														
 
															+    )
														
 
															+    # [mask] section header 在 [reader] 后 [writer] 前
														
 
															+    assert '\n[mask]\n' in out
														
 
															+    assert 'user_name = mask_middle' in out
														
 
															+    assert 'phone = md5' in out
														
 
															+    reader_idx = out.index('\n[reader]\n')
														
 
															+    mask_idx = out.index('\n[mask]\n')
														
 
															+    writer_idx = out.index('\n[writer]\n')
														
 
															+    assert reader_idx < mask_idx < writer_idx
														
 
															 def test_query_columns_full_returns_full_metadata():
														
@@ -80,19 +99,56 @@ def test_query_columns_full_returns_full_metadata():
 
															     ]
														
 
															-def test_render_schema_md_table_format():
														
 
															+def test_render_schema_md_no_mask_dict_blank_column():
														
 
															     rows = [
														
 
															         (1, 'id', 'id', 'bigint', 'PK'),
														
 
															         (2, 'user_name', '用户名', 'character varying', ''),
														
 
															-        (3, 'create_time', None, 'timestamp without time zone', ''),  # 无注释
														
 
															+        (3, 'create_time', None, 'timestamp without time zone', ''),
														
 
															     ]
														
 
															     out = GEN.render_schema_md(rows)
														
 
															-    assert '| 序号 | 字段名 | 中文名 | 数据类型 | 主键标识 | 裁剪类型 |' in out
														
 
															+    assert '| 序号 | 字段名 | 中文名 | 数据类型 | 主键标识 | 脱敏类型 |' in out
														
 
															     assert '| 1 | `id` | id | bigint | PK |  |' in out
														
 
															     assert '| 2 | `user_name` | 用户名 | character varying |  |  |' in out
														
 
															     assert '| 3 | `create_time` |  | timestamp without time zone |  |  |' in out
														
 
															+def test_render_schema_md_with_mask_dict():
														
 
															+    rows = [
														
 
															+        (1, 'id', 'id', 'bigint', 'PK'),
														
 
															+        (2, 'user_name', '用户名', 'character varying', ''),
														
 
															+        (3, 'phone', '手机号', 'character varying', ''),
														
 
															+        (4, 'merchant_open', '商家代开', 'smallint', ''),
														
 
															+    ]
														
 
															+    mask_dict = {'phone': 'md5', 'merchant_open': 'trim', 'user_name': 'mask_middle'}
														
 
															+    out = GEN.render_schema_md(rows, mask_dict)
														
 
															+    assert '| 1 | `id` | id | bigint | PK |  |' in out
														
 
															+    assert '| 2 | `user_name` | 用户名 | character varying |  | mask_middle |' in out
														
 
															+    assert '| 3 | `phone` | 手机号 | character varying |  | md5 |' in out
														
 
															+    assert '| 4 | `merchant_open` | 商家代开 | smallint |  | trim |' in out
														
 
															+
														
 
															+
														
 
															+def test_load_mask_conf_basic(tmp_path):
														
 
															+    p = tmp_path / 't.mask.ini'
														
 
															+    p.write_text(
														
 
															+        '[mask]\n'
														
 
															+        'payment_num = trim\n'
														
 
															+        'phone = md5\n'
														
 
															+        'name = mask_middle\n',
														
 
															+        encoding='utf-8',
														
 
															+    )
														
 
															+    assert GEN.load_mask_conf(str(p)) == {
														
 
															+        'payment_num': 'trim',
														
 
															+        'phone': 'md5',
														
 
															+        'name': 'mask_middle',
														
 
															+    }
														
 
															+
														
 
															+
														
 
															+def test_load_mask_conf_no_section_returns_empty(tmp_path):
														
 
															+    p = tmp_path / 't.mask.ini'
														
 
															+    p.write_text('[other]\nfoo = bar\n', encoding='utf-8')
														
 
															+    assert GEN.load_mask_conf(str(p)) == {}
														
 
															+
														
 
															+
														
 
															 def test_render_template_empty_pk():
														
 
															     out = GEN.render_template(
														
 
															         ds_ref='postgresql/prod-hobby', database='db', schema='public',