|
|
@@ -50,8 +50,6 @@ WORKSPACE_DEFAULT = os.path.join(
|
|
|
ANCHOR_FIELDS = ('create_time', 'update_time')
|
|
|
# 抽样上限:TABLESAMPLE SYSTEM(1) 按存储页跳跃后再 LIMIT 截断
|
|
|
PROBE_SAMPLE_LIMIT = 1000
|
|
|
-# 锚点非空率告警阈值:低于此值认为业务方维护不全
|
|
|
-ANCHOR_NOTNULL_WARN_PCT = 95.0
|
|
|
|
|
|
|
|
|
def resolve_datasource(ds_ref):
|
|
|
@@ -106,19 +104,16 @@ def query_columns_full(conn, schema, table):
|
|
|
|
|
|
|
|
|
def probe_table(conn, schema, table, full_rows):
|
|
|
- """对表做行数估值 + 锚点抽样非空率 + 软删字段命中。
|
|
|
-
|
|
|
- - 行数:pg_class.reltuples 估值(autovacuum 后准,避免大表 count(*) 全扫)
|
|
|
- - 锚点字段:硬编码 create_time / update_time,存在性 + 抽样非空率
|
|
|
- (TABLESAMPLE SYSTEM(1) LIMIT 1000,按存储页跳跃采样,亿级表毫秒级)
|
|
|
- - 软删字段:从 full_rows 字段名筛 'del' 子串(不区分大小写)
|
|
|
-
|
|
|
- 返回 dict {
|
|
|
- 'reltuples': int,
|
|
|
- 'sample_total': int,
|
|
|
- 'anchor': {col: {'exists': bool, 'notnull': int or None}, ...},
|
|
|
- 'del_candidates': [str, ...],
|
|
|
- }
|
|
|
+ """对表做行数估值 + PK + 锚点抽样 + 近期 update_time + 主键序时间范围 + 软删命中。
|
|
|
+
|
|
|
+ - 行数:pg_class.reltuples 估值
|
|
|
+ - PK:单/复合/无 + 是否自增(pg_get_serial_sequence)
|
|
|
+ - 锚点:create_time / update_time 存在性 + 抽样非空率(TABLESAMPLE SYSTEM(1) LIMIT 1000)
|
|
|
+ - 近期 update_time 非空率:仅当单 PK + update_time 存在;ORDER BY pk DESC LIMIT 1000
|
|
|
+ - create_time 主键序范围:仅当单列自增 PK + create_time 存在;ORDER BY pk ASC/DESC LIMIT 1
|
|
|
+ - 软删:full_rows 筛 'del' 子串(不区分大小写)
|
|
|
+
|
|
|
+ 返回 dict 见 render_probe_md 引用字段。
|
|
|
"""
|
|
|
cur = conn.cursor()
|
|
|
|
|
|
@@ -131,6 +126,28 @@ def probe_table(conn, schema, table, full_rows):
|
|
|
row = cur.fetchone()
|
|
|
reltuples = int(row[0]) if row and row[0] is not None else 0
|
|
|
|
|
|
+ pk_cols = [r[1] for r in full_rows if r[4] == 'PK']
|
|
|
+ pk_auto_increment = False
|
|
|
+ if len(pk_cols) == 1:
|
|
|
+ # pg_get_serial_sequence 只识别 OWNED BY 关联的 sequence——
|
|
|
+ # 业务库手工建的 sequence 没 OWNED BY 标记会漏判,所以同时查 attidentity
|
|
|
+ # (PG 10+ IDENTITY 列)和 default 表达式(含 nextval 即视为自增)。
|
|
|
+ cur.execute("""
|
|
|
+ SELECT a.attidentity, pg_get_expr(ad.adbin, ad.adrelid)
|
|
|
+ FROM pg_attribute a
|
|
|
+ LEFT JOIN pg_attrdef ad ON ad.adrelid = a.attrelid AND ad.adnum = a.attnum
|
|
|
+ JOIN pg_class c ON c.oid = a.attrelid
|
|
|
+ JOIN pg_namespace n ON n.oid = c.relnamespace
|
|
|
+ WHERE n.nspname = %s AND c.relname = %s AND a.attname = %s
|
|
|
+ """, (schema, table, pk_cols[0]))
|
|
|
+ r = cur.fetchone()
|
|
|
+ if r:
|
|
|
+ attidentity, default_expr = r[0], r[1]
|
|
|
+ pk_auto_increment = (
|
|
|
+ attidentity in ('a', 'd')
|
|
|
+ or (default_expr is not None and 'nextval' in default_expr.lower())
|
|
|
+ )
|
|
|
+
|
|
|
field_names = {r[1] for r in full_rows}
|
|
|
anchor = {col: {'exists': col in field_names, 'notnull': None}
|
|
|
for col in ANCHOR_FIELDS}
|
|
|
@@ -151,12 +168,47 @@ def probe_table(conn, schema, table, full_rows):
|
|
|
for i, c in enumerate(present):
|
|
|
anchor[c]['notnull'] = int(result[i + 1])
|
|
|
|
|
|
+ recent_total = None
|
|
|
+ recent_update_notnull = None
|
|
|
+ if len(pk_cols) == 1 and anchor['update_time']['exists']:
|
|
|
+ sql = (
|
|
|
+ 'SELECT count(*), count("update_time") FROM '
|
|
|
+ '(SELECT update_time FROM "{schema}"."{table}" '
|
|
|
+ ' ORDER BY "{pk}" DESC LIMIT {lim}) AS sub'
|
|
|
+ ).format(schema=schema, table=table, pk=pk_cols[0],
|
|
|
+ lim=PROBE_SAMPLE_LIMIT)
|
|
|
+ cur.execute(sql)
|
|
|
+ result = cur.fetchone()
|
|
|
+ recent_total = int(result[0])
|
|
|
+ recent_update_notnull = int(result[1])
|
|
|
+
|
|
|
+ create_time_earliest = None
|
|
|
+ create_time_latest = None
|
|
|
+ if pk_auto_increment and anchor['create_time']['exists']:
|
|
|
+ sql = (
|
|
|
+ 'SELECT '
|
|
|
+ '(SELECT create_time FROM "{schema}"."{table}" '
|
|
|
+ ' ORDER BY "{pk}" ASC LIMIT 1), '
|
|
|
+ '(SELECT create_time FROM "{schema}"."{table}" '
|
|
|
+ ' ORDER BY "{pk}" DESC LIMIT 1)'
|
|
|
+ ).format(schema=schema, table=table, pk=pk_cols[0])
|
|
|
+ cur.execute(sql)
|
|
|
+ result = cur.fetchone()
|
|
|
+ create_time_earliest = result[0]
|
|
|
+ create_time_latest = result[1]
|
|
|
+
|
|
|
del_candidates = sorted(r[1] for r in full_rows if 'del' in r[1].lower())
|
|
|
|
|
|
return {
|
|
|
'reltuples': reltuples,
|
|
|
+ 'pk_cols': pk_cols,
|
|
|
+ 'pk_auto_increment': pk_auto_increment,
|
|
|
'sample_total': sample_total,
|
|
|
'anchor': anchor,
|
|
|
+ 'recent_total': recent_total,
|
|
|
+ 'recent_update_notnull': recent_update_notnull,
|
|
|
+ 'create_time_earliest': create_time_earliest,
|
|
|
+ 'create_time_latest': create_time_latest,
|
|
|
'del_candidates': del_candidates,
|
|
|
}
|
|
|
|
|
|
@@ -165,25 +217,48 @@ def render_probe_md(stats):
|
|
|
"""渲染探查段 markdown。"""
|
|
|
lines = ['### 探查', '']
|
|
|
lines.append('- 行数估值(pg_class.reltuples):{:,}'.format(stats['reltuples']))
|
|
|
+
|
|
|
+ pk_cols = stats['pk_cols']
|
|
|
+ if not pk_cols:
|
|
|
+ pk_desc = '无(DataX channel 无法并行)'
|
|
|
+ elif len(pk_cols) > 1:
|
|
|
+ pk_desc = '复合 ({}) (DataX splitPk 不支持复合,退串行)'.format(
|
|
|
+ ', '.join('`{}`'.format(c) for c in pk_cols))
|
|
|
+ elif stats['pk_auto_increment']:
|
|
|
+ pk_desc = '`{}`(自增)'.format(pk_cols[0])
|
|
|
+ else:
|
|
|
+ pk_desc = '`{}`(非自增,DataX channel 切分分布可能不均)'.format(pk_cols[0])
|
|
|
+ lines.append('- 主键:' + pk_desc)
|
|
|
+
|
|
|
lines.append('- 锚点字段:')
|
|
|
total = stats['sample_total']
|
|
|
for col in ANCHOR_FIELDS:
|
|
|
s = stats['anchor'][col]
|
|
|
if not s['exists']:
|
|
|
- lines.append(' - `{}`:**缺失** ⚠'.format(col))
|
|
|
- elif total > 0 and s['notnull'] is not None:
|
|
|
- nn = s['notnull']
|
|
|
- pct = 100.0 * nn / total
|
|
|
- warn = ' ⚠' if pct < ANCHOR_NOTNULL_WARN_PCT else ''
|
|
|
- lines.append(' - `{}`:存在;抽样 {} 行非空 {} ({:.1f}%){}'.format(
|
|
|
- col, total, nn, pct, warn))
|
|
|
+ lines.append(' - `{}`:缺失'.format(col))
|
|
|
+ continue
|
|
|
+ if total > 0 and s['notnull'] is not None:
|
|
|
+ pct = 100.0 * s['notnull'] / total
|
|
|
+ base = '`{}`:存在;整体非空率 {:.1f}% ({}/{} 抽样)'.format(
|
|
|
+ col, pct, s['notnull'], total)
|
|
|
else:
|
|
|
- lines.append(' - `{}`:存在;抽样无数据'.format(col))
|
|
|
+ base = '`{}`:存在;抽样无数据'.format(col)
|
|
|
+ if col == 'create_time' and stats['create_time_earliest']:
|
|
|
+ base += ';按主键序范围 {} ~ {}'.format(
|
|
|
+ stats['create_time_earliest'], stats['create_time_latest'])
|
|
|
+ lines.append(' - ' + base)
|
|
|
+ if col == 'update_time' and stats['recent_total'] is not None:
|
|
|
+ rt = stats['recent_total']
|
|
|
+ rnn = stats['recent_update_notnull']
|
|
|
+ rpct = 100.0 * rnn / rt if rt else 0.0
|
|
|
+ lines.append(' - 近期非空率 {:.1f}% ({}/{} 最近 1000 行)'.format(
|
|
|
+ rpct, rnn, rt))
|
|
|
+
|
|
|
if stats['del_candidates']:
|
|
|
lines.append('- 软删字段(含 `del` 子串):' + ', '.join(
|
|
|
'`{}`'.format(c) for c in stats['del_candidates']))
|
|
|
else:
|
|
|
- lines.append('- 软删字段(含 `del` 子串):**未命中** ⚠')
|
|
|
+ lines.append('- 软删字段(含 `del` 子串):未命中')
|
|
|
return '\n'.join(lines) + '\n'
|
|
|
|
|
|
|