Pārlūkot izejas kodu

feat(tdm/usr): tdm_usr_tag_d + tdm_usr_tag_o DDL + sche + y2025 凝固(EAV 7 字段 + 71+32 tag_code)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
tianyu.chu 2 dienas atpakaļ
vecāks
revīzija
d4a20206f6

+ 195 - 0
jobs/tdm/usr/tdm_usr_tag_d.sql

@@ -0,0 +1,195 @@
+-- 作者:tianyu.chu
+-- 日期:2026-05-11
+-- 工单:(无)
+-- 目的:tdm_usr_tag_d 日常调度 + 手动 init 复用(kb/33 §2):
+--      INSERT OVERWRITE PARTITION (dt='${dt}') 静态单分区全量重刷;
+--      7 属性(attr) ← dim_usr_user_ful_d.dt='${dt}' 各属性 UNION ALL;
+--      4 偏好窗口(stat) ← dws_usr_user_trade_1d 滚动 30d / y{当年} 累计聚合 UNION ALL;
+--      EAV 7 字段(kb/33 §1.2);WHERE 源字段 IS NOT NULL / HAVING SUM > 0 过滤空标签
+-- 状态:[草案]
+-- 备注:sched=T,${dt}=T-1(项目级 globalParam,kb/26);
+--      30d 滚动起点 = DATE_SUB(${dt}, 29) [T-30, T-1];
+--      y{当年} 累计起点 = '${dt}' 前 4 位 + '0101' = 当年 01-01;
+--      tag_code 当年通过 SUBSTR('${dt}', 1, 4) 拼接,跨年自然滚动(dt=20270101 起,即 27-01-02 的调度实例,自动从 y2026 切到 y2027);
+--      属性细节口径(出生世代切片 / sex 原值 / 等)按 kb/33 §6 默认,业务回头校准换字段不动 schema(EAV 收益);
+--      birthday_cert 业务库 STRING 多格式,REPLACE 去 '-' 后取 yyyyMM/yyyy 兼容 'yyyy-MM-dd' 与 'yyyyMMdd';
+--      前置 DS DEPENDENT:dim_usr_user_ful_d.${dt} + dws_usr_user_trade_1d.${dt}
+
+-- Rebuilds partition dt='${dt}' of tdm.tdm_usr_tag_d with a single static-partition overwrite.
+-- Output is long-format (EAV): one row per (entity_id, tag_code). Branches:
+--   1-7   attribute tags (tag_type='attr') read from dim.dim_usr_user_ful_d at dt='${dt}';
+--   8-11  preference tags (tag_type='stat') aggregated from dws.dws_usr_user_trade_1d
+--         over a trailing 30-day window and a year-to-date window.
+-- A row is emitted only when its source value is non-NULL (attr) or its aggregate is > 0 (stat).
+-- confidence is the constant 1.0 for every branch; etl_time is the statement's CURRENT_TIMESTAMP().
+INSERT OVERWRITE TABLE tdm.tdm_usr_tag_d PARTITION (dt='${dt}')
+SELECT entity_id, tag_code, tag_value, tag_type, confidence, etl_time FROM (
+
+    -- ============ Attribute tags (attr): 7 ============
+
+    -- 1. usr_level: member level, stored as its string form
+    SELECT
+        user_id                                                  AS entity_id,
+        'usr_level'                                              AS tag_code,
+        CAST(member_level AS STRING)                             AS tag_value,
+        'attr'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM dim.dim_usr_user_ful_d
+    WHERE dt = '${dt}' AND member_level IS NOT NULL
+
+    UNION ALL
+
+    -- 2. usr_is_cert: real-name certification flag
+    SELECT
+        user_id                                                  AS entity_id,
+        'usr_is_cert'                                            AS tag_code,
+        CAST(is_cert AS STRING)                                  AS tag_value,
+        'attr'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM dim.dim_usr_user_ful_d
+    WHERE dt = '${dt}' AND is_cert IS NOT NULL
+
+    UNION ALL
+
+    -- 3. usr_sex: sex, stored as the raw source value (mapping rule pending business confirmation)
+    SELECT
+        user_id                                                  AS entity_id,
+        'usr_sex'                                                AS tag_code,
+        CAST(sex_cert AS STRING)                                 AS tag_value,
+        'attr'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM dim.dim_usr_user_ful_d
+    WHERE dt = '${dt}' AND sex_cert IS NOT NULL
+
+    UNION ALL
+
+    -- 4. usr_city: city taken from cert_city; rows with NULL cert_city are filtered out
+    SELECT
+        user_id                                                  AS entity_id,
+        'usr_city'                                               AS tag_code,
+        cert_city                                                AS tag_value,
+        'attr'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM dim.dim_usr_user_ful_d
+    WHERE dt = '${dt}' AND cert_city IS NOT NULL
+
+    UNION ALL
+
+    -- 5. usr_register_time: registration time formatted as yyyyMMdd
+    SELECT
+        user_id                                                  AS entity_id,
+        'usr_register_time'                                      AS tag_code,
+        DATE_FORMAT(reg_create_time, 'yyyyMMdd')                 AS tag_value,
+        'attr'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM dim.dim_usr_user_ful_d
+    WHERE dt = '${dt}' AND reg_create_time IS NOT NULL
+
+    UNION ALL
+
+    -- 6. usr_birth_month: birth year-month (yyyyMM). '-' is stripped first so that
+    --    'yyyy-MM-dd' and 'yyyyMMdd' inputs reduce to the same leading 6 digits.
+    SELECT
+        user_id                                                  AS entity_id,
+        'usr_birth_month'                                        AS tag_code,
+        SUBSTR(REPLACE(birthday_cert, '-', ''), 1, 6)            AS tag_value,
+        'attr'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM dim.dim_usr_user_ful_d
+    WHERE dt = '${dt}'
+      AND birthday_cert IS NOT NULL
+      -- Validate the whole yyyyMM prefix (year 1000-2999, month 01-12), not just the year:
+      -- values in other layouts (e.g. 'yyyy/MM/dd', 'yyyy-M-d') would otherwise be written
+      -- as a malformed tag_value. The anchored pattern also implies length >= 6.
+      AND SUBSTR(REPLACE(birthday_cert, '-', ''), 1, 6) RLIKE '^[12][0-9]{3}(0[1-9]|1[0-2])$'
+
+    UNION ALL
+
+    -- 7. usr_generation: birth-year bucket label. Buckets are 10 years wide before 1980
+    --    and 5 years wide from 1980 on; everything from 2010 upward falls into '10后'.
+    SELECT
+        b.user_id                                                AS entity_id,
+        'usr_generation'                                         AS tag_code,
+        CASE
+            WHEN b.birth_year < 1960 THEN '60前'
+            WHEN b.birth_year < 1970 THEN '60后'
+            WHEN b.birth_year < 1980 THEN '70后'
+            WHEN b.birth_year < 1985 THEN '80后'
+            WHEN b.birth_year < 1990 THEN '85后'
+            WHEN b.birth_year < 1995 THEN '90后'
+            WHEN b.birth_year < 2000 THEN '95后'
+            WHEN b.birth_year < 2005 THEN '00后'
+            WHEN b.birth_year < 2010 THEN '05后'
+            ELSE '10后'
+        END                                                      AS tag_value,
+        'attr'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM (
+        -- Extract the 4-digit birth year once instead of repeating the expression per WHEN.
+        -- The anchored pattern guarantees exactly four digits, so the CAST cannot yield NULL.
+        SELECT
+            user_id,
+            CAST(SUBSTR(REPLACE(birthday_cert, '-', ''), 1, 4) AS INT) AS birth_year
+        FROM dim.dim_usr_user_ful_d
+        WHERE dt = '${dt}'
+          AND birthday_cert IS NOT NULL
+          AND SUBSTR(REPLACE(birthday_cert, '-', ''), 1, 4) RLIKE '^[12][0-9]{3}$'
+    ) b
+
+    -- ============ Preference tags (stat): one tag_code per category x 4 windows ============
+    -- CONCAT returns NULL when any argument is NULL, so every branch below filters
+    -- category IS NOT NULL to avoid emitting rows with a NULL tag_code.
+    -- NOTE(review): CAST(SUM(...) AS STRING) assumes pay_amt_cny is DECIMAL/integer; a DOUBLE
+    -- column could render large sums in scientific notation - confirm against the dws schema.
+
+    UNION ALL
+
+    -- 8. usr_pref_trade_{category}_amt_30d: paid amount over the 30 days ending at ${dt}
+    --    (window start = ${dt} minus 29 days, both ends inclusive)
+    SELECT
+        user_id                                                  AS entity_id,
+        CONCAT('usr_pref_trade_', category, '_amt_30d')          AS tag_code,
+        CAST(SUM(pay_amt_cny) AS STRING)                         AS tag_value,
+        'stat'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM dws.dws_usr_user_trade_1d
+    WHERE dt BETWEEN DATE_FORMAT(DATE_SUB(FROM_UNIXTIME(UNIX_TIMESTAMP('${dt}', 'yyyyMMdd')), 29), 'yyyyMMdd')
+                AND '${dt}'
+      AND category IS NOT NULL
+    GROUP BY user_id, category
+    HAVING SUM(pay_amt_cny) > 0
+
+    UNION ALL
+
+    -- 9. usr_pref_trade_{category}_cnt_30d: paid order count over the same 30-day window
+    SELECT
+        user_id                                                  AS entity_id,
+        CONCAT('usr_pref_trade_', category, '_cnt_30d')          AS tag_code,
+        CAST(SUM(pay_order_cnt) AS STRING)                       AS tag_value,
+        'stat'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM dws.dws_usr_user_trade_1d
+    WHERE dt BETWEEN DATE_FORMAT(DATE_SUB(FROM_UNIXTIME(UNIX_TIMESTAMP('${dt}', 'yyyyMMdd')), 29), 'yyyyMMdd')
+                AND '${dt}'
+      AND category IS NOT NULL
+    GROUP BY user_id, category
+    HAVING SUM(pay_order_cnt) > 0
+
+    UNION ALL
+
+    -- 10. usr_pref_trade_{category}_amt_y{yyyy}: paid amount from Jan 1 of ${dt}'s year
+    --     through ${dt}; the year suffix of tag_code is the first 4 characters of ${dt}
+    SELECT
+        user_id                                                  AS entity_id,
+        CONCAT('usr_pref_trade_', category, '_amt_y',
+               SUBSTR('${dt}', 1, 4))                            AS tag_code,
+        CAST(SUM(pay_amt_cny) AS STRING)                         AS tag_value,
+        'stat'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM dws.dws_usr_user_trade_1d
+    WHERE dt BETWEEN CONCAT(SUBSTR('${dt}', 1, 4), '0101') AND '${dt}'
+      AND category IS NOT NULL
+    GROUP BY user_id, category
+    HAVING SUM(pay_amt_cny) > 0
+
+    UNION ALL
+
+    -- 11. usr_pref_trade_{category}_cnt_y{yyyy}: paid order count over the same
+    --     year-to-date window
+    SELECT
+        user_id                                                  AS entity_id,
+        CONCAT('usr_pref_trade_', category, '_cnt_y',
+               SUBSTR('${dt}', 1, 4))                            AS tag_code,
+        CAST(SUM(pay_order_cnt) AS STRING)                       AS tag_value,
+        'stat'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM dws.dws_usr_user_trade_1d
+    WHERE dt BETWEEN CONCAT(SUBSTR('${dt}', 1, 4), '0101') AND '${dt}'
+      AND category IS NOT NULL
+    GROUP BY user_id, category
+    HAVING SUM(pay_order_cnt) > 0
+
+) t;

+ 47 - 0
manual/backfill/20260511_tdm_usr_tag_o_y2025.sql

@@ -0,0 +1,47 @@
+-- 作者:tianyu.chu
+-- 日期:2026-05-11
+-- 工单:(无)
+-- 目的:tdm_usr_tag_o y2025 凝固(kb/33 §3 tdm_usr_tag_o):
+--      扫 dws.dws_usr_user_trade_1d.dt BETWEEN '20250101' AND '20251231' +
+--      GROUP BY (user_id, category) 16 品类 × 金额+次数 = 32 tag_code +
+--      INSERT OVERWRITE PARTITION (dt='20251231') 单分区永久固定
+-- 状态:[待执行]
+-- 备注:跑一次后该 dt 分区永远不动(insert-only 凝固语义,Kimball 周期快照事实表标准);
+--      27-01-01 凝固 26 年时新落 manual/backfill/{date}_tdm_usr_tag_o_y2026.sql + dt='20261231',
+--      同表 tdm_usr_tag_o 多 dt 分区,不新建表;
+--      tag_code 命名 usr_pref_trade_{category}_{amt|cnt}_y2025;
+--      WHERE 'dt BETWEEN' STRING 字典序对 yyyyMMdd 格式安全;
+--      HAVING SUM > 0 过滤空消费用户,EAV 习惯不存空标签
+
+-- One-off load of the frozen 2025 preference tags into partition dt='20251231' of
+-- tdm.tdm_usr_tag_o. Aggregates dws.dws_usr_user_trade_1d over dt 20250101..20251231
+-- (string comparison on yyyyMMdd) per (user_id, category) and emits two tag_codes per
+-- category: paid amount (_amt_y2025) and paid order count (_cnt_y2025).
+-- Groups whose sum is not > 0 are not emitted.
+-- CONCAT returns NULL when any argument is NULL, so both branches filter
+-- category IS NOT NULL to avoid writing rows with a NULL tag_code.
+-- NOTE(review): CAST(SUM(...) AS STRING) assumes pay_amt_cny is DECIMAL/integer; a DOUBLE
+-- column could render large sums in scientific notation - confirm against the dws schema.
+INSERT OVERWRITE TABLE tdm.tdm_usr_tag_o PARTITION (dt='20251231')
+SELECT entity_id, tag_code, tag_value, tag_type, confidence, etl_time FROM (
+
+    -- Paid amount per (user, category) for 2025
+    SELECT
+        user_id                                                  AS entity_id,
+        CONCAT('usr_pref_trade_', category, '_amt_y2025')        AS tag_code,
+        CAST(SUM(pay_amt_cny) AS STRING)                         AS tag_value,
+        'stat'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM dws.dws_usr_user_trade_1d
+    WHERE dt BETWEEN '20250101' AND '20251231'
+      AND category IS NOT NULL
+    GROUP BY user_id, category
+    HAVING SUM(pay_amt_cny) > 0
+
+    UNION ALL
+
+    -- Paid order count per (user, category) for 2025
+    SELECT
+        user_id                                                  AS entity_id,
+        CONCAT('usr_pref_trade_', category, '_cnt_y2025')        AS tag_code,
+        CAST(SUM(pay_order_cnt) AS STRING)                       AS tag_value,
+        'stat'                                                   AS tag_type,
+        CAST(1.0 AS DECIMAL(5,4))                                AS confidence,
+        CURRENT_TIMESTAMP()                                      AS etl_time
+    FROM dws.dws_usr_user_trade_1d
+    WHERE dt BETWEEN '20250101' AND '20251231'
+      AND category IS NOT NULL
+    GROUP BY user_id, category
+    HAVING SUM(pay_order_cnt) > 0
+
+) t;

+ 25 - 0
manual/ddl/tdm/usr/tdm_usr_tag_d_create.sql

@@ -0,0 +1,25 @@
+-- 作者:tianyu.chu
+-- 日期:2026-05-11
+-- 工单:(无)
+-- 目的:用户标签 日更长表(kb/33 §2 tdm_usr_tag_d)
+-- 状态:[草案]
+-- 备注:EAV 严守 kb/23 §2 / kb/33 §1.2 7 字段不扩(entity_id/tag_code/tag_value/tag_type/confidence/etl_time/dt);
+--      tag_type 枚举 attr/stat/rule,预留 algo 给将来 ML 标签;
+--      1 期含 7 属性 + 16 品类 × 4 偏好窗口(30d 金额+次数 / y{当年} 金额+次数) = 71 tag_code/分区;
+--      跑批 INSERT OVERWRITE PARTITION (dt='${dt}') 静态单分区;
+--      前置 DS DEPENDENT:dim_usr_user_ful_d.${dt} + dws_usr_user_trade_1d.${dt}
+
+-- Recreates the metastore definition of the daily user-tag table (6 data columns + dt partition).
+-- The table is EXTERNAL: by default DROP TABLE removes only the metastore entry and leaves
+-- the ORC files under LOCATION in place, so after the CREATE the table would have no
+-- registered partitions even though earlier dt=... directories still exist.
+DROP TABLE IF EXISTS tdm.tdm_usr_tag_d;
+
+CREATE EXTERNAL TABLE IF NOT EXISTS tdm.tdm_usr_tag_d (
+    entity_id    BIGINT         COMMENT '实体 id(用户场景=user_id)',
+    tag_code     STRING         COMMENT '标签编码(维度全 encode,命名见 kb/33 §4)',
+    tag_value    STRING         COMMENT '标签值,统一 STRING;数值标签下游 CAST(... AS DECIMAL)',
+    tag_type     STRING         COMMENT '标签类型 attr/stat/rule(预留 algo)',
+    confidence   DECIMAL(5,4)   COMMENT '置信度,规则标签 1.0;模型标签按模型输出',
+    etl_time     TIMESTAMP      COMMENT 'ETL 处理时间'
+)
+COMMENT '用户标签 日更长表'
+PARTITIONED BY (dt STRING)
+STORED AS ORC
+LOCATION '/user/hive/warehouse/tdm.db/tdm_usr_tag_d';
+
+-- Re-register any dt=... partition directories already present under LOCATION, so that
+-- re-running this script does not make previously loaded partitions invisible to queries.
+-- On a first-time create (empty location) this is a no-op.
+MSCK REPAIR TABLE tdm.tdm_usr_tag_d;

+ 25 - 0
manual/ddl/tdm/usr/tdm_usr_tag_o_create.sql

@@ -0,0 +1,25 @@
+-- 作者:tianyu.chu
+-- 日期:2026-05-11
+-- 工单:(无)
+-- 目的:用户标签 往年凝固长表(kb/33 §3 tdm_usr_tag_o)
+-- 状态:[草案]
+-- 备注:EAV 同 tdm_usr_tag_d 7 字段不扩(schema 完全一致);
+--      每凝固年 16 品类 × 金额+次数 = 32 tag_code/分区;
+--      按 Kimball 周期快照事实表 + 阿里 OneData 范式:单表 + insert-only 时间分区;
+--      1 期落 dt='20251231'(25 年凝固),27-01-01 凝固 26 年时新落 dt='20261231',以此类推;
+--      手动一次性灌入,不挂日调度;每个 dt 分区写入后永远不动
+
+-- Recreates the metastore definition of the frozen past-year user-tag table
+-- (same 6 data columns + dt partition as tdm_usr_tag_d).
+-- The table is EXTERNAL: by default DROP TABLE removes only the metastore entry and leaves
+-- the ORC files under LOCATION in place, so after the CREATE the table would have no
+-- registered partitions even though frozen dt=... directories still exist.
+DROP TABLE IF EXISTS tdm.tdm_usr_tag_o;
+
+CREATE EXTERNAL TABLE IF NOT EXISTS tdm.tdm_usr_tag_o (
+    entity_id    BIGINT         COMMENT '实体 id(用户场景=user_id)',
+    tag_code     STRING         COMMENT '标签编码(维度全 encode,命名见 kb/33 §4)',
+    tag_value    STRING         COMMENT '标签值,统一 STRING;数值标签下游 CAST(... AS DECIMAL)',
+    tag_type     STRING         COMMENT '标签类型 attr/stat/rule(预留 algo)',
+    confidence   DECIMAL(5,4)   COMMENT '置信度,规则标签 1.0;模型标签按模型输出',
+    etl_time     TIMESTAMP      COMMENT 'ETL 处理时间'
+)
+COMMENT '用户标签 往年凝固长表 单表多 dt 分区'
+PARTITIONED BY (dt STRING)
+STORED AS ORC
+LOCATION '/user/hive/warehouse/tdm.db/tdm_usr_tag_o';
+
+-- Re-register any dt=... partition directories already present under LOCATION. The yearly
+-- partitions are loaded once by hand and never rewritten, so without this step a re-run of
+-- this script would leave them unregistered until someone repairs the table manually.
+-- On a first-time create (empty location) this is a no-op.
+MSCK REPAIR TABLE tdm.tdm_usr_tag_o;