Bladeren bron

feat(tracking): 埋点脱敏核心 mask.py + tracking-mask.ini + 单测

tianyu.chu 6 dagen geleden
bovenliggende
commit
4fe76ea0fb
5 gewijzigde bestanden met toevoegingen van 257 en 0 verwijderingen
  1. 13 0
      conf/tracking-mask.ini
  2. 0 0
      dw_base/tracking/__init__.py
  3. 125 0
      dw_base/tracking/mask.py
  4. 0 0
      tests/unit/tracking/__init__.py
  5. 119 0
      tests/unit/tracking/test_mask.py

+ 13 - 0
conf/tracking-mask.ini

@@ -0,0 +1,13 @@
+# 埋点入仓脱敏配置(事件级)
+#
+# 仅声明含敏事件;未在此声明的事件 → 全字段原样入 raw(兜底,靠协作流程补含敏事件)。
+# 段名 [event:<事件名>] 对应 _source.event;规则作用于 properties 顶层与 properties.params 两层。
+#   drop = f1, f2            整字段删除,不入 raw
+#   mask = f3:method, ...    字段脱敏;method ∈ md5 / month_trunc / mask_middle / keep_first_n / keep_last_n
+# drop 与 mask 同字段时 drop 优先。方法语义单一真值见 conf/templates/datax/mask/mask.template.ini。
+#
+# 现状:仅录入探查到的已知 PII(PayOrder 收货四要素);全量敏感清单以埋点开发方文档为准后续增补。
+
+[event:PayOrder]
+# The four delivery fields: recipient real name / phone number / address (receiverArea was observed to also carry the full address) -- highly sensitive, none of them is ingested.
+drop = receiverName, receiverTelephone, receiverArea, receiverAddress

+ 0 - 0
dw_base/tracking/__init__.py


+ 125 - 0
dw_base/tracking/mask.py

@@ -0,0 +1,125 @@
+# -*- coding:utf-8 -*-
+"""
+埋点入仓的字段级脱敏:在已解析的 properties dict 上按事件应用 drop / mask。
+
+与 dw_base/datax/mask.py 的区别:
+- datax/mask.py 在源 PG 端生成脱敏 querySql,SQL 在业务库执行(敏感原值不出业务库);
+- 本模块作用在已解析的 JSON dict 上 —— 埋点无源库,gz 文件已从固定服务器导出,
+  脱敏在入 raw 前的 Spark UDF 里做(kb/13 §4 合规破例)。
+两者共用同一套方法语义,单一真值见 conf/templates/datax/mask/mask.template.ini。
+
+兜底:未在配置声明的事件 → properties 原样返回,靠协作流程补含敏事件(kb/13 §6)。
+"""
+import hashlib
+import os
+import re
+from configparser import ConfigParser
+from copy import deepcopy
+
+# Section-name prefix that marks a per-event rule block, e.g. [event:PayOrder].
+_EVENT_PREFIX = 'event:'
+# Method names that are fixed literals (they carry no numeric parameter).
+_STATIC_METHODS = ('md5', 'month_trunc', 'mask_middle')
+# Parameterised method names; the captured integer is the number of characters kept.
+_KEEP_FIRST = re.compile(r'^keep_first_(\d+)$')
+_KEEP_LAST = re.compile(r'^keep_last_(\d+)$')
+
+
+def _to_text(value):
+    """Return ``value`` as-is when it is already a ``str``, otherwise ``str(value)``."""
+    if isinstance(value, str):
+        return value
+    return str(value)
+
+
+def _known_method(method):
+    """Return True when ``method`` is a static method name or matches keep_first_<n> / keep_last_<n>."""
+    if method in _STATIC_METHODS:
+        return True
+    for pattern in (_KEEP_FIRST, _KEEP_LAST):
+        if pattern.match(method):
+            return True
+    return False
+
+
+def apply_method(method, value):
+    """
+    Mask a single value with the named method.
+
+    - ``None`` is returned unchanged, whatever the method.
+    - ``md5``: hex digest of the UTF-8 encoded text.
+    - ``month_trunc``: text starting with 'YYYY-MM' or 'YYYY/MM' becomes 'YYYY-MM';
+      anything else is returned as text, unmodified.
+    - ``mask_middle``: first 3 chars + '****' + last 4 chars; text shorter than
+      8 chars is returned unmodified.
+    - ``keep_first_<n>``: first n chars followed by '****'.
+    - ``keep_last_<n>``: '****' followed by the last n chars.
+
+    Non-string values are converted with ``_to_text`` before masking.
+    Raises ValueError for any other method name.
+    """
+    if value is None:
+        return None
+    if method == 'md5':
+        encoded = _to_text(value).encode('utf-8')
+        return hashlib.md5(encoded).hexdigest()
+    if method == 'month_trunc':
+        text = _to_text(value)
+        hit = re.match(r'^(\d{4})[-/](\d{2})', text)
+        if hit is None:
+            return text
+        return '{0}-{1}'.format(hit.group(1), hit.group(2))
+    if method == 'mask_middle':
+        # Mirrors the datax regex (.{3}).+(.{4}), which needs at least 8 characters.
+        text = _to_text(value)
+        if len(text) < 8:
+            return text
+        return text[:3] + '****' + text[-4:]
+    head = _KEEP_FIRST.match(method)
+    if head:
+        keep = int(head.group(1))
+        return _to_text(value)[:keep] + '****'
+    tail = _KEEP_LAST.match(method)
+    if tail:
+        keep = int(tail.group(1))
+        # text[-0:] would be the whole string, so n == 0 keeps nothing explicitly.
+        suffix = _to_text(value)[-keep:] if keep else ''
+        return '****' + suffix
+    raise ValueError(
+        '未知脱敏方法 {0!r}(可用 md5 / month_trunc / mask_middle / keep_first_n / keep_last_n)'.format(method)
+    )
+
+
+def apply_mask(event_name, properties, conf):
+    """
+    Apply the masking rule of one event to its properties dict.
+
+    - Rules act on the top level of ``properties`` and on the nested
+      ``properties['params']`` when that value is a dict.
+    - ``drop`` fields are removed; ``mask`` fields are replaced by
+      ``apply_method(method, value)``.
+    - A field listed under both ``drop`` and ``mask`` is dropped (drop runs first).
+    - When a rule applies, the result is a deep copy; the input is not modified.
+    - Passthrough: when ``event_name`` has no (or an empty) rule in ``conf``, or
+      when ``properties`` is not a dict (e.g. ``None``), ``properties`` itself is
+      returned -- the same object, not a copy.
+
+    :param event_name: event name used as the key into ``conf``.
+    :param properties: parsed properties of the event; normally a dict.
+    :param conf: mapping in the shape returned by ``load_mask_conf``.
+    :return: masked copy, or ``properties`` itself on passthrough.
+    :raises ValueError: propagated from ``apply_method`` for an unknown method.
+    """
+    rule = conf.get(event_name)
+    # A non-dict payload has no fields to drop or mask; returning it unchanged
+    # avoids an AttributeError on .get() for declared events with null properties.
+    if not rule or not isinstance(properties, dict):
+        return properties
+    result = deepcopy(properties)
+    layers = [result]
+    params = result.get('params')
+    if isinstance(params, dict):
+        layers.append(params)
+    # Drop first, so a field listed in both drop and mask is already gone when masking runs.
+    for field in rule.get('drop', ()):
+        for layer in layers:
+            layer.pop(field, None)
+    for field, method in rule.get('mask', {}).items():
+        for layer in layers:
+            if field in layer:
+                layer[field] = apply_method(method, layer[field])
+    return result
+
+
+def load_mask_conf(path):
+    """
+    Parse the masking config into {event_name: {'drop': [field, ...], 'mask': {field: method}}}.
+
+    Only sections named ``event:<name>`` are read; other sections are skipped.
+    ``drop`` is a comma-separated field list, ``mask`` a comma-separated list of
+    ``field:method`` items. A rule only contains the keys its section declares.
+
+    The loader fails fast, because a config that silently loads as empty lets
+    sensitive fields reach raw storage unmasked:
+    - a missing file raises FileNotFoundError;
+    - an existing but unreadable file raises OSError (the file is opened
+      explicitly, since ConfigParser.read() silently skips files it cannot open);
+    - a ``mask`` item without ``:`` or with an empty field name raises ValueError;
+    - a ``mask`` item with an unsupported method raises ValueError.
+
+    :param path: path of the INI file.
+    :return: dict keyed by event name.
+    """
+    if not os.path.isfile(path):
+        raise FileNotFoundError('脱敏配置不存在:{0}'.format(path))
+    parser = ConfigParser()
+    # Option names are matched case-sensitively ('drop' / 'mask' must be lowercase).
+    # Field names live in option values, which ConfigParser never case-folds.
+    parser.optionxform = str
+    with open(path, encoding='utf-8') as handle:
+        parser.read_file(handle)
+    conf = {}
+    for section in parser.sections():
+        if not section.startswith(_EVENT_PREFIX):
+            continue
+        event = section[len(_EVENT_PREFIX):]
+        rule = {}
+        if parser.has_option(section, 'drop'):
+            rule['drop'] = [f.strip() for f in parser.get(section, 'drop').split(',') if f.strip()]
+        if parser.has_option(section, 'mask'):
+            mask = {}
+            for item in parser.get(section, 'mask').split(','):
+                item = item.strip()
+                if not item:
+                    continue
+                field, sep, method = item.partition(':')
+                field, method = field.strip(), method.strip()
+                # An empty field name would be stored as a rule that can never match.
+                if not sep or not field:
+                    raise ValueError('[mask] 项格式应为 field:method,实际 {0!r}(section [{1}])'.format(item, section))
+                if not _known_method(method):
+                    raise ValueError(
+                        '未知脱敏方法 {0!r}(section [{1}],可用 md5 / month_trunc / '
+                        'mask_middle / keep_first_n / keep_last_n)'.format(method, section)
+                    )
+                mask[field] = method
+            rule['mask'] = mask
+        conf[event] = rule
+    return conf

+ 0 - 0
tests/unit/tracking/__init__.py


+ 119 - 0
tests/unit/tracking/test_mask.py

@@ -0,0 +1,119 @@
+# -*- coding:utf-8 -*-
+import pytest
+
+from dw_base.tracking.mask import apply_mask, apply_method, load_mask_conf
+
+
+# ---- apply_method: semantics of the 5 methods (md5 / month_trunc / mask_middle / keep_first_n / keep_last_n), aligned with datax/mask.py ----
+
+def test_method_md5():
+    """md5 yields the lowercase hex digest of the UTF-8 encoded text."""
+    digest = apply_method('md5', 'abc')
+    assert digest == '900150983cd24fb0d6963f7d28e17f72'
+
+
+def test_method_mask_middle():
+    """mask_middle keeps the first 3 and last 4 characters around '****'."""
+    masked = apply_method('mask_middle', '17301839727')
+    assert masked == '173****9727'
+
+
+def test_method_mask_middle_short_unchanged():
+    """Text shorter than 8 characters is returned unmasked by mask_middle."""
+    short = '1234567'
+    assert apply_method('mask_middle', short) == '1234567'
+
+
+def test_method_keep_first():
+    """keep_first_2 keeps the first two characters and appends '****'."""
+    masked = apply_method('keep_first_2', 'abcdef')
+    assert masked == 'ab****'
+
+
+def test_method_keep_last():
+    """keep_last_4 prefixes '****' to the last four characters."""
+    masked = apply_method('keep_last_4', '17301839727')
+    assert masked == '****9727'
+
+
+def test_method_month_trunc():
+    """month_trunc reduces a timestamp string to 'YYYY-MM'."""
+    truncated = apply_method('month_trunc', '2024-05-13 10:00:00')
+    assert truncated == '2024-05'
+
+
+def test_method_month_trunc_non_date_unchanged():
+    """Text that does not start with a year-month is returned unchanged."""
+    text = 'not-a-date'
+    assert apply_method('month_trunc', text) == 'not-a-date'
+
+
+def test_method_none_passthrough():
+    """None is passed through untouched instead of being hashed."""
+    result = apply_method('md5', None)
+    assert result is None
+
+
+def test_method_unknown_raises():
+    """An unsupported method name is rejected with ValueError."""
+    unknown = 'bogus'
+    with pytest.raises(ValueError, match='未知脱敏方法'):
+        apply_method(unknown, 'x')
+
+
+# ---- apply_mask:drop / mask / 兜底 / 优先级 / 嵌套 ----
+
+# Shared fixture: a drop-only event, a mask-only event, and an event that lists
+# the same field under both drop and mask.
+CONF = dict(
+    PayOrder={'drop': ['receiverName', 'receiverTelephone']},
+    Login={'mask': {'phone': 'keep_last_4'}},
+    Mixed={'drop': ['x'], 'mask': {'x': 'md5'}},
+)
+
+
+def test_drop_in_params_layer():
+    """A dropped field is removed from the nested params dict; its siblings stay."""
+    nested = {'receiverName': 'Ethan', 'orderNo': 'A1'}
+    masked = apply_mask('PayOrder', {'storeId': '31', 'params': nested}, CONF)
+    remaining = masked['params']
+    assert 'receiverName' not in remaining
+    assert remaining['orderNo'] == 'A1'
+
+
+def test_apply_mask_does_not_mutate_input():
+    """The caller's dict still holds the sensitive value after masking."""
+    original = {'params': {'receiverName': 'Ethan'}}
+    apply_mask('PayOrder', original, CONF)
+    untouched = original['params']['receiverName']
+    assert untouched == 'Ethan'
+
+
+def test_undeclared_event_passthrough():
+    """An event with no rule in the config comes back equal to the input."""
+    props = {'params': {'whatever': '1'}}
+    out = apply_mask('$AppStart', props, CONF)
+    assert out == props
+
+
+def test_mask_top_level_field():
+    """A mask rule is applied to a field at the top level of properties."""
+    props = {'phone': '17301839727'}
+    masked = apply_mask('Login', props, CONF)
+    assert masked['phone'] == '****9727'
+
+
+def test_drop_beats_mask_same_field():
+    """Drop wins over mask: the field disappears rather than being hashed."""
+    props = {'x': 'secret'}
+    masked = apply_mask('Mixed', props, CONF)
+    assert 'x' not in masked
+
+
+# ---- load_mask_conf:解析 / fail-fast ----
+
+def test_load_conf_parses_drop_and_mask(tmp_path):
+    """Both drop and mask options of an event section are parsed."""
+    ini_lines = [
+        '[event:PayOrder]\n',
+        'drop = receiverName, receiverTelephone\n',
+        'mask = receiverArea:mask_middle\n',
+    ]
+    ini_file = tmp_path / 'tracking-mask.ini'
+    ini_file.write_text(''.join(ini_lines), encoding='utf-8')
+    rule = load_mask_conf(str(ini_file))['PayOrder']
+    assert rule['drop'] == ['receiverName', 'receiverTelephone']
+    assert rule['mask'] == {'receiverArea': 'mask_middle'}
+
+
+def test_load_conf_ignores_non_event_sections(tmp_path):
+    """Only event:<name> sections end up as keys of the parsed config."""
+    ini_file = tmp_path / 'm.ini'
+    ini_file.write_text('[DEFAULT]\nfoo = bar\n[event:X]\ndrop = a\n', encoding='utf-8')
+    events = set(load_mask_conf(str(ini_file)))
+    assert events == {'X'}
+
+
+def test_load_conf_missing_file_raises(tmp_path):
+    """A config path that does not exist raises FileNotFoundError."""
+    missing = str(tmp_path / 'nope.ini')
+    with pytest.raises(FileNotFoundError):
+        load_mask_conf(missing)
+
+
+def test_load_conf_unknown_method_raises(tmp_path):
+    """An unsupported method in a mask item is rejected at load time."""
+    ini_file = tmp_path / 'm.ini'
+    ini_file.write_text('[event:X]\nmask = f:bogus\n', encoding='utf-8')
+    ini_path = str(ini_file)
+    with pytest.raises(ValueError, match='未知脱敏方法'):
+        load_mask_conf(ini_path)
+
+
+def test_load_conf_bad_mask_format_raises(tmp_path):
+    """A mask item without the field:method separator is rejected at load time."""
+    ini_file = tmp_path / 'm.ini'
+    ini_file.write_text('[event:X]\nmask = noColon\n', encoding='utf-8')
+    ini_path = str(ini_file)
+    with pytest.raises(ValueError, match='field:method'):
+        load_mask_conf(ini_path)