Kaynağa Gözat

refactor(tdm/usr): tdm stat 改跨层下钻 dwd + order_type='group' + tag_code 去 pref

1 期 dws 上层无消费方,tdm stat 跨层取数 dwd_trd_order_pay_apd_d 重新 group by,
加 order_type='group' 限定拼团粒度,加 category IS NOT NULL 过滤脏数据;
tag_code 从 usr_pref_trade_* 改 usr_trade_*(1 期为消费明细统计非偏好);
cnt 计算从 SUM(pay_order_cnt) 改 COUNT(DISTINCT order_id)(dwd 订单粒度);
dws_usr_user_trade_1d 保通用日聚合语义不动,留作未来 BI/2 期消费方.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
tianyu.chu 4 hafta önce
ebeveyn
işleme
e42c6aecf9

+ 31 - 21
jobs/tdm/usr/tdm_usr_tag_d.sql

@@ -1,19 +1,21 @@
 -- 作者:tianyu.chu
--- 日期:2026-05-11
+-- 日期:2026-05-12
 -- 工单:(无)
 -- 目的:tdm_usr_tag_d 日常调度 + 手动 init 复用(kb/33 §2):
 --      INSERT OVERWRITE PARTITION (dt='${dt}') 静态单分区全量重刷;
 --      7 属性(attr) ← dim_usr_user_ful_d.dt='${dt}' 各属性 UNION ALL;
---      4 偏好窗口(stat) ← dws_usr_user_trade_1d 滚动 30d / y{当年} 累计聚合 UNION ALL;
+--      4 消费明细统计窗口(stat,1 期拼团订单粒度) ← dwd_trd_order_pay_apd_d 跨层下钻
+--          (1 期 dws 上层无消费方,跨层取数,kb/93 ADR-10)滚动 30d / y{当年} 累计聚合 UNION ALL;
 --      EAV 7 字段(kb/33 §1.2);WHERE 源字段 IS NOT NULL / HAVING SUM(pay_amt_cny) > 0(金额)或 COUNT(DISTINCT order_id) > 0(次数)过滤空标签
 -- 状态:[草案]
 -- 备注:sched=T,${dt}=T-1(项目级 globalParam,kb/26);
 --      30d 滚动起点 = DATE_SUB(${dt}, 29) [T-30, T-1];
 --      y{当年} 累计起点 = '${dt}' 前 4 位 + '0101' = 当年 01-01;
 --      tag_code 当年通过 SUBSTR('${dt}', 1, 4) 拼接,跨年自然滚动(dt='20270101' 分区起自动从 y2026 切到 y2027;因 sched=T 且 ${dt}=T-1,即 27-01-02 的调度首次产出 y2027);
+--      stat 段 WHERE 必带 order_type='group'(1 期拼团粒度,kb/34 §编码规则);
 --      属性细节口径(出生世代切片 / sex 原值 / 等)按 kb/33 §6 默认,业务回头校准换字段不动 schema(EAV 收益);
 --      birthday_cert 业务库 STRING 多格式,REPLACE 去 '-' 后取 yyyyMM/yyyy 兼容 'yyyy-MM-dd' 与 'yyyyMMdd';
---      前置 DS DEPENDENT:dim_usr_user_ful_d.${dt} + dws_usr_user_trade_1d.${dt}
+--      前置 DS DEPENDENT:dim_usr_user_ful_d.${dt} + dwd_trd_order_pay_apd_d.${dt}
 
 INSERT OVERWRITE TABLE tdm.tdm_usr_tag_d PARTITION (dt='${dt}')
 SELECT entity_id, tag_code, tag_value, tag_type, confidence, etl_time FROM (
@@ -59,7 +61,7 @@ SELECT entity_id, tag_code, tag_value, tag_type, confidence, etl_time FROM (
 
     UNION ALL
 
-    -- 4. usr_city 城市(取 cert_city,未实名 NULL 已 filter)
+    -- 4. usr_city 城市(取 cert_city,未实名 NULL 已 filter;真值为区级如"上海市徐汇区")
     SELECT
         user_id                                                  AS entity_id,
         'usr_city'                                               AS tag_code,
@@ -126,70 +128,78 @@ SELECT entity_id, tag_code, tag_value, tag_type, confidence, etl_time FROM (
       AND LENGTH(REPLACE(birthday_cert, '-', '')) >= 4
       AND SUBSTR(REPLACE(birthday_cert, '-', ''), 1, 4) RLIKE '^[12][0-9]{3}$'
 
-    -- ============ 偏好类 stat 16 品类 × 4 窗口 = 64 个 ============
+    -- ============ 消费明细统计 stat 16 品类 × 4 窗口 = 64 个(1 期拼团粒度) ============
 
     UNION ALL
 
-    -- 8. usr_pref_trade_{category}_amt_30d 16 品类 × 近 30 天金额
+    -- 8. usr_trade_{category}_amt_30d 16 品类 × 近 30 天金额
     SELECT
         user_id                                                  AS entity_id,
-        CONCAT('usr_pref_trade_', category, '_amt_30d')          AS tag_code,
+        CONCAT('usr_trade_', category, '_amt_30d')               AS tag_code,
         CAST(SUM(pay_amt_cny) AS STRING)                         AS tag_value,
         'stat'                                                   AS tag_type,
         CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
         CURRENT_TIMESTAMP()                                      AS etl_time
-    FROM dws.dws_usr_user_trade_1d
+    FROM dwd.dwd_trd_order_pay_apd_d
     WHERE dt BETWEEN DATE_FORMAT(DATE_SUB(FROM_UNIXTIME(UNIX_TIMESTAMP('${dt}', 'yyyyMMdd')), 29), 'yyyyMMdd')
                 AND '${dt}'
+      AND order_type = 'group'
+      AND category IS NOT NULL
     GROUP BY user_id, category
     HAVING SUM(pay_amt_cny) > 0
 
     UNION ALL
 
-    -- 9. usr_pref_trade_{category}_cnt_30d 16 品类 × 近 30 天次数
+    -- 9. usr_trade_{category}_cnt_30d 16 品类 × 近 30 天次数
     SELECT
         user_id                                                  AS entity_id,
-        CONCAT('usr_pref_trade_', category, '_cnt_30d')          AS tag_code,
-        CAST(SUM(pay_order_cnt) AS STRING)                       AS tag_value,
+        CONCAT('usr_trade_', category, '_cnt_30d')               AS tag_code,
+        CAST(COUNT(DISTINCT order_id) AS STRING)                 AS tag_value,
         'stat'                                                   AS tag_type,
         CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
         CURRENT_TIMESTAMP()                                      AS etl_time
-    FROM dws.dws_usr_user_trade_1d
+    FROM dwd.dwd_trd_order_pay_apd_d
     WHERE dt BETWEEN DATE_FORMAT(DATE_SUB(FROM_UNIXTIME(UNIX_TIMESTAMP('${dt}', 'yyyyMMdd')), 29), 'yyyyMMdd')
                 AND '${dt}'
+      AND order_type = 'group'
+      AND category IS NOT NULL
     GROUP BY user_id, category
-    HAVING SUM(pay_order_cnt) > 0
+    HAVING COUNT(DISTINCT order_id) > 0
 
     UNION ALL
 
-    -- 10. usr_pref_trade_{category}_amt_y{当年} 16 品类 × 当年累计金额
+    -- 10. usr_trade_{category}_amt_y{当年} 16 品类 × 当年累计金额
     SELECT
         user_id                                                  AS entity_id,
-        CONCAT('usr_pref_trade_', category, '_amt_y',
+        CONCAT('usr_trade_', category, '_amt_y',
                SUBSTR('${dt}', 1, 4))                            AS tag_code,
         CAST(SUM(pay_amt_cny) AS STRING)                         AS tag_value,
         'stat'                                                   AS tag_type,
         CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
         CURRENT_TIMESTAMP()                                      AS etl_time
-    FROM dws.dws_usr_user_trade_1d
+    FROM dwd.dwd_trd_order_pay_apd_d
     WHERE dt BETWEEN CONCAT(SUBSTR('${dt}', 1, 4), '0101') AND '${dt}'
+      AND order_type = 'group'
+      AND category IS NOT NULL
     GROUP BY user_id, category
     HAVING SUM(pay_amt_cny) > 0
 
     UNION ALL
 
-    -- 11. usr_pref_trade_{category}_cnt_y{当年} 16 品类 × 当年累计次数
+    -- 11. usr_trade_{category}_cnt_y{当年} 16 品类 × 当年累计次数
     SELECT
         user_id                                                  AS entity_id,
-        CONCAT('usr_pref_trade_', category, '_cnt_y',
+        CONCAT('usr_trade_', category, '_cnt_y',
                SUBSTR('${dt}', 1, 4))                            AS tag_code,
-        CAST(SUM(pay_order_cnt) AS STRING)                       AS tag_value,
+        CAST(COUNT(DISTINCT order_id) AS STRING)                 AS tag_value,
         'stat'                                                   AS tag_type,
         CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
         CURRENT_TIMESTAMP()                                      AS etl_time
-    FROM dws.dws_usr_user_trade_1d
+    FROM dwd.dwd_trd_order_pay_apd_d
     WHERE dt BETWEEN CONCAT(SUBSTR('${dt}', 1, 4), '0101') AND '${dt}'
+      AND order_type = 'group'
+      AND category IS NOT NULL
     GROUP BY user_id, category
-    HAVING SUM(pay_order_cnt) > 0
+    HAVING COUNT(DISTINCT order_id) > 0
 
 ) t;

+ 15 - 9
manual/backfill/20260511_tdm_usr_tag_o_y2025.sql

@@ -2,14 +2,16 @@
 -- 日期:2026-05-11
 -- 工单:(无)
 -- 目的:tdm_usr_tag_o y2025 凝固(kb/33 §3 tdm_usr_tag_o):
---      扫 dws.dws_usr_user_trade_1d.dt BETWEEN '20250101' AND '20251231' +
+--      扫 dwd.dwd_trd_order_pay_apd_d.dt BETWEEN '20250101' AND '20251231' +
+--      WHERE order_type='group'(1 期拼团粒度) + category IS NOT NULL +
 --      GROUP BY (user_id, category) 16 品类 × 金额+次数 = 32 tag_code +
 --      INSERT OVERWRITE PARTITION (dt='20251231') 单分区永久固定
 -- 状态:[待执行]
--- 备注:跑一次后该 dt 分区永远不动(insert-only 凝固语义,Kimball 周期快照事实表标准);
+-- 备注:跨层下钻 dwd(kb/93 ADR-10,1 期 dws 上层无消费方);
+--      跑一次后该 dt 分区永远不动(insert-only 凝固语义,Kimball 周期快照事实表标准);
 --      27-01-01 凝固 26 年时新落 manual/backfill/{date}_tdm_usr_tag_o_y2026.sql + dt='20261231',
 --      同表 tdm_usr_tag_o 多 dt 分区,不新建表;
---      tag_code 命名 usr_pref_trade_{category}_{amt|cnt}_y2025;
+--      tag_code 命名 usr_trade_{category}_{amt|cnt}_y2025(kb/34 §编码规则);
 --      WHERE 'dt BETWEEN' STRING 字典序对 yyyyMMdd 格式安全;
 --      HAVING SUM(pay_amt_cny) > 0(金额)/ COUNT(DISTINCT order_id) > 0(次数)过滤空消费用户,EAV 习惯不存空标签
 
@@ -19,13 +21,15 @@ SELECT entity_id, tag_code, tag_value, tag_type, confidence, etl_time FROM (
     -- 16 品类 × 25 年金额
     SELECT
         user_id                                                  AS entity_id,
-        CONCAT('usr_pref_trade_', category, '_amt_y2025')        AS tag_code,
+        CONCAT('usr_trade_', category, '_amt_y2025')             AS tag_code,
         CAST(SUM(pay_amt_cny) AS STRING)                         AS tag_value,
         'stat'                                                   AS tag_type,
         CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
         CURRENT_TIMESTAMP()                                      AS etl_time
-    FROM dws.dws_usr_user_trade_1d
+    FROM dwd.dwd_trd_order_pay_apd_d
     WHERE dt BETWEEN '20250101' AND '20251231'
+      AND order_type = 'group'
+      AND category IS NOT NULL
     GROUP BY user_id, category
     HAVING SUM(pay_amt_cny) > 0
 
@@ -34,14 +38,16 @@ SELECT entity_id, tag_code, tag_value, tag_type, confidence, etl_time FROM (
     -- 16 品类 × 25 年次数
     SELECT
         user_id                                                  AS entity_id,
-        CONCAT('usr_pref_trade_', category, '_cnt_y2025')        AS tag_code,
-        CAST(SUM(pay_order_cnt) AS STRING)                       AS tag_value,
+        CONCAT('usr_trade_', category, '_cnt_y2025')             AS tag_code,
+        CAST(COUNT(DISTINCT order_id) AS STRING)                 AS tag_value,
         'stat'                                                   AS tag_type,
         CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
         CURRENT_TIMESTAMP()                                      AS etl_time
-    FROM dws.dws_usr_user_trade_1d
+    FROM dwd.dwd_trd_order_pay_apd_d
     WHERE dt BETWEEN '20250101' AND '20251231'
+      AND order_type = 'group'
+      AND category IS NOT NULL
     GROUP BY user_id, category
-    HAVING SUM(pay_order_cnt) > 0
+    HAVING COUNT(DISTINCT order_id) > 0
 
 ) t;