Просмотр исходного кода

refactor(dim/usr): sche 改业界主流模式 B(NOT IN + UNION ALL + CASE WHEN 分组判断)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
tianyu.chu 2 дней назад
Родитель
Сommit
5d09add
1 измененных файлов с 90 добавлено и 50 удалено
  1. 90 50
      jobs/dim/usr/dim_usr_user_ful_d.sql

+ 90 - 50
jobs/dim/usr/dim_usr_user_ful_d.sql

@@ -1,14 +1,16 @@
 -- 作者:tianyu.chu
 -- 日期:2026-05-09
 -- 工单:(无)
--- 目的:dim_usr_user_ful_d 日常增量(kb/28 §1.3):
---      new = 今日 ods (base / cert) 增量;old = dim.dt=${pdt} 昨日全量;
---      old FULL OUTER JOIN today_base FULL OUTER JOIN today_cert,字段 COALESCE(today, old) 优先取今日
+-- 目的:dim_usr_user_ful_d 日常增量(kb/28 §1.3,业界主流模式 B):
+--      (a) 找今日变更 user_id (base 或 cert 任一变);
+--      (b) 这些 user 重 join 形成 today_rebuilt:base 字段 CASE WHEN bu IS NOT NULL THEN bu.x ELSE o.x,cert 字段同理;
+--      (c) 昨日 dim 中今日没变的 user (NOT IN today_changed) 直接保留;
+--      (d) UNION ALL 写入 dim dt=${dt} 单分区
 -- 状态:[草案]
 -- 备注:sched=T,${dt}=T-1,${pdt}=T-2;
---      ods 增量同步只含当天有 update 的 user,所以今日 ods 即"今日变更行";
---      base 字段从 today_base 取(今日没变就 NULL→fallback old);cert 字段从 today_cert 取同理;
+--      base / cert 整组判断"是否今日变更",避免字段级 COALESCE 在"业务库主动置 NULL"场景下错误兜底昨日值;
 --      cert_info update_time 几乎全空,ORDER BY 用 COALESCE(update_time, create_time);
+--      性能:dim 大表只 scan + broadcast NOT IN 剪枝,不参与 shuffle;today_changed user 数量小 broadcast 有效;
 --      前置 DS DEPENDENT:ods.base_user.dt=${dt} + ods.cert_info.dt=${dt} + dim.dt=${pdt}
 
 INSERT OVERWRITE TABLE dim.dim_usr_user_ful_d PARTITION (dt='${dt}')
@@ -38,50 +40,88 @@ today_cert AS (
     ) t
     WHERE t.rn = 1
 ),
-old_dim AS (
-    SELECT * FROM dim.dim_usr_user_ful_d WHERE dt = '${pdt}'
+today_changed AS (
+    SELECT id AS user_id FROM today_base
+    UNION
+    SELECT user_id FROM today_cert
+),
+today_rebuilt AS (
+    SELECT
+        COALESCE(bu.id, ci.user_id, o.user_id)                                                AS user_id,
+        CASE WHEN bu.id IS NOT NULL THEN bu.appid                ELSE o.appid                END AS appid,
+        CASE WHEN bu.id IS NOT NULL THEN bu.username             ELSE o.username             END AS username,
+        CASE WHEN bu.id IS NOT NULL THEN bu.code                 ELSE o.code                 END AS code,
+        CASE WHEN ci.user_id IS NOT NULL THEN ci.cert_sex        ELSE o.sex_cert             END AS sex_cert,
+        CASE WHEN ci.user_id IS NOT NULL THEN ci.cert_birthday   ELSE o.birthday_cert        END AS birthday_cert,
+        CASE WHEN ci.user_id IS NOT NULL THEN ci.cert_province   ELSE o.cert_province        END AS cert_province,
+        CASE WHEN ci.user_id IS NOT NULL THEN ci.cert_city       ELSE o.cert_city            END AS cert_city,
+        CASE WHEN ci.user_id IS NOT NULL OR o.is_cert = TRUE THEN TRUE ELSE FALSE END        AS is_cert,
+        CASE WHEN bu.id IS NOT NULL THEN bu.id_card              ELSE o.id_card              END AS id_card,
+        CASE WHEN bu.id IS NOT NULL THEN bu.face_verify          ELSE o.face_verify          END AS face_verify,
+        CASE WHEN bu.id IS NOT NULL THEN bu.cancel_verify_num    ELSE o.cancel_verify_num    END AS cancel_verify_num,
+        CASE WHEN bu.id IS NOT NULL THEN bu.register_channel     ELSE o.register_channel     END AS register_channel,
+        CASE WHEN bu.id IS NOT NULL THEN bu.register_addr        ELSE o.register_addr        END AS register_addr,
+        CASE WHEN bu.id IS NOT NULL THEN bu.register_ip_addr     ELSE o.register_ip_addr     END AS register_ip_addr,
+        CASE WHEN bu.id IS NOT NULL THEN bu.create_time          ELSE o.reg_create_time      END AS reg_create_time,
+        CASE WHEN bu.id IS NOT NULL THEN bu.login_addr           ELSE o.login_addr           END AS login_addr,
+        CASE WHEN bu.id IS NOT NULL THEN bu.login_ip_addr        ELSE o.login_ip_addr        END AS login_ip_addr,
+        CASE WHEN bu.id IS NOT NULL THEN bu.level                ELSE o.level                END AS level,
+        CASE WHEN bu.id IS NOT NULL THEN bu.member_level         ELSE o.member_level         END AS member_level,
+        CASE WHEN bu.id IS NOT NULL THEN bu.member_name          ELSE o.member_name          END AS member_name,
+        CASE WHEN bu.id IS NOT NULL THEN bu.growth_num           ELSE o.growth_num           END AS growth_num,
+        CASE WHEN bu.id IS NOT NULL THEN bu.current_month_growth ELSE o.current_month_growth END AS current_month_growth,
+        CASE WHEN bu.id IS NOT NULL THEN bu.member_keep_growth   ELSE o.member_keep_growth   END AS member_keep_growth,
+        CASE WHEN bu.id IS NOT NULL THEN bu.member_init_flag     ELSE o.member_init_flag     END AS member_init_flag,
+        CASE WHEN bu.id IS NOT NULL THEN bu.point                ELSE o.point                END AS point,
+        CASE WHEN bu.id IS NOT NULL THEN bu.consume_amount       ELSE o.consume_amount_cny   END AS consume_amount_cny,
+        CASE WHEN bu.id IS NOT NULL THEN bu.order_total_num      ELSE o.order_total_num      END AS order_total_num,
+        CASE WHEN bu.id IS NOT NULL THEN bu.status               ELSE o.status               END AS status,
+        CASE WHEN bu.id IS NOT NULL THEN bu.blacklist            ELSE o.blacklist            END AS blacklist,
+        CASE WHEN bu.id IS NOT NULL THEN bu.refuse_pick_up       ELSE o.refuse_pick_up       END AS refuse_pick_up,
+        CASE WHEN bu.id IS NOT NULL THEN bu.notify_flag          ELSE o.notify_flag          END AS notify_flag,
+        CASE WHEN bu.id IS NOT NULL THEN bu.open_invoice         ELSE o.open_invoice         END AS open_invoice,
+        CASE WHEN bu.id IS NOT NULL THEN bu.open_psd             ELSE o.open_psd             END AS open_psd,
+        CASE WHEN bu.id IS NOT NULL THEN bu.daily_limit          ELSE o.daily_limit          END AS daily_limit,
+        CASE WHEN bu.id IS NOT NULL THEN bu.weekly_limit         ELSE o.weekly_limit         END AS weekly_limit,
+        CASE WHEN bu.id IS NOT NULL THEN bu.monthly_limit        ELSE o.monthly_limit        END AS monthly_limit,
+        CASE WHEN bu.id IS NOT NULL THEN bu.update_time          ELSE o.last_update_time     END AS last_update_time,
+        CASE WHEN bu.id IS NOT NULL THEN bu.is_deleted           ELSE o.is_deleted           END AS is_deleted,
+        CURRENT_TIMESTAMP()                                                                    AS etl_time
+    FROM today_base bu
+    FULL OUTER JOIN today_cert ci ON bu.id = ci.user_id
+    LEFT JOIN dim.dim_usr_user_ful_d o
+        ON COALESCE(bu.id, ci.user_id) = o.user_id AND o.dt = '${pdt}'
 )
 SELECT
-    COALESCE(bu.id, ci.user_id, o.user_id)                    AS user_id,
-    COALESCE(bu.appid, o.appid)                               AS appid,
-    COALESCE(bu.username, o.username)                         AS username,
-    COALESCE(bu.code, o.code)                                 AS code,
-    COALESCE(ci.cert_sex, o.sex_cert)                         AS sex_cert,
-    COALESCE(ci.cert_birthday, o.birthday_cert)               AS birthday_cert,
-    COALESCE(ci.cert_province, o.cert_province)               AS cert_province,
-    COALESCE(ci.cert_city, o.cert_city)                       AS cert_city,
-    CASE WHEN ci.user_id IS NOT NULL OR o.is_cert = TRUE THEN TRUE ELSE FALSE END AS is_cert,
-    COALESCE(bu.id_card, o.id_card)                           AS id_card,
-    COALESCE(bu.face_verify, o.face_verify)                   AS face_verify,
-    COALESCE(bu.cancel_verify_num, o.cancel_verify_num)       AS cancel_verify_num,
-    COALESCE(bu.register_channel, o.register_channel)         AS register_channel,
-    COALESCE(bu.register_addr, o.register_addr)               AS register_addr,
-    COALESCE(bu.register_ip_addr, o.register_ip_addr)         AS register_ip_addr,
-    COALESCE(bu.create_time, o.reg_create_time)               AS reg_create_time,
-    COALESCE(bu.login_addr, o.login_addr)                     AS login_addr,
-    COALESCE(bu.login_ip_addr, o.login_ip_addr)               AS login_ip_addr,
-    COALESCE(bu.level, o.level)                               AS level,
-    COALESCE(bu.member_level, o.member_level)                 AS member_level,
-    COALESCE(bu.member_name, o.member_name)                   AS member_name,
-    COALESCE(bu.growth_num, o.growth_num)                     AS growth_num,
-    COALESCE(bu.current_month_growth, o.current_month_growth) AS current_month_growth,
-    COALESCE(bu.member_keep_growth, o.member_keep_growth)     AS member_keep_growth,
-    COALESCE(bu.member_init_flag, o.member_init_flag)         AS member_init_flag,
-    COALESCE(bu.point, o.point)                               AS point,
-    COALESCE(bu.consume_amount, o.consume_amount_cny)         AS consume_amount_cny,
-    COALESCE(bu.order_total_num, o.order_total_num)           AS order_total_num,
-    COALESCE(bu.status, o.status)                             AS status,
-    COALESCE(bu.blacklist, o.blacklist)                       AS blacklist,
-    COALESCE(bu.refuse_pick_up, o.refuse_pick_up)             AS refuse_pick_up,
-    COALESCE(bu.notify_flag, o.notify_flag)                   AS notify_flag,
-    COALESCE(bu.open_invoice, o.open_invoice)                 AS open_invoice,
-    COALESCE(bu.open_psd, o.open_psd)                         AS open_psd,
-    COALESCE(bu.daily_limit, o.daily_limit)                   AS daily_limit,
-    COALESCE(bu.weekly_limit, o.weekly_limit)                 AS weekly_limit,
-    COALESCE(bu.monthly_limit, o.monthly_limit)               AS monthly_limit,
-    COALESCE(bu.update_time, o.last_update_time)              AS last_update_time,
-    COALESCE(bu.is_deleted, o.is_deleted)                     AS is_deleted,
-    CURRENT_TIMESTAMP()                                       AS etl_time
-FROM old_dim o
-FULL OUTER JOIN today_base bu ON o.user_id = bu.id
-FULL OUTER JOIN today_cert ci ON COALESCE(o.user_id, bu.id) = ci.user_id;
+    user_id, appid, username, code,
+    sex_cert, birthday_cert, cert_province, cert_city, is_cert,
+    id_card, face_verify, cancel_verify_num,
+    register_channel, register_addr, register_ip_addr, reg_create_time,
+    login_addr, login_ip_addr,
+    level, member_level, member_name,
+    growth_num, current_month_growth, member_keep_growth, member_init_flag,
+    point, consume_amount_cny, order_total_num,
+    status, blacklist,
+    refuse_pick_up, notify_flag, open_invoice, open_psd,
+    daily_limit, weekly_limit, monthly_limit,
+    last_update_time, is_deleted, etl_time
+FROM today_rebuilt
+
+UNION ALL
+
+SELECT
+    user_id, appid, username, code,
+    sex_cert, birthday_cert, cert_province, cert_city, is_cert,
+    id_card, face_verify, cancel_verify_num,
+    register_channel, register_addr, register_ip_addr, reg_create_time,
+    login_addr, login_ip_addr,
+    level, member_level, member_name,
+    growth_num, current_month_growth, member_keep_growth, member_init_flag,
+    point, consume_amount_cny, order_total_num,
+    status, blacklist,
+    refuse_pick_up, notify_flag, open_invoice, open_psd,
+    daily_limit, weekly_limit, monthly_limit,
+    last_update_time, is_deleted, etl_time
+FROM dim.dim_usr_user_ful_d
+WHERE dt = '${pdt}'
+  AND user_id NOT IN (SELECT user_id FROM today_changed);