|
@@ -11,44 +11,85 @@ import html
|
|
|
import json
|
|
import json
|
|
|
import random
|
|
import random
|
|
|
import re
|
|
import re
|
|
|
-import traceback
|
|
|
|
|
-from collections import Counter
|
|
|
|
|
|
|
+from ast import literal_eval
|
|
|
from datetime import datetime
|
|
from datetime import datetime
|
|
|
from typing import Dict, List, Union
|
|
from typing import Dict, List, Union
|
|
|
|
|
|
|
|
from pyspark.sql.functions import udf
|
|
from pyspark.sql.functions import udf
|
|
|
from pyspark.sql.types import (
|
|
from pyspark.sql.types import (
|
|
|
- ArrayType, BooleanType, FloatType, IntegerType, LongType, MapType,
|
|
|
|
|
- StringType, StructField, StructType,
|
|
|
|
|
|
|
+ ArrayType, BooleanType, FloatType, LongType, MapType, StringType,
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
from dw_base.utils.datetime_utils import parse_datetime
|
|
from dw_base.utils.datetime_utils import parse_datetime
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+def _load_json_or_default(data, default=None):
|
|
|
|
|
+ """优先按 JSON 解析,失败时返回默认值。"""
|
|
|
|
|
+ try:
|
|
|
|
|
+ return json.loads(data)
|
|
|
|
|
+ except (TypeError, ValueError):
|
|
|
|
|
+ return default
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def _load_json_or_literal(data, default=None):
|
|
|
|
|
+ """先按 JSON 解析,失败后再按 Python 字面量兜底解析。"""
|
|
|
|
|
+ parsed = _load_json_or_default(data, default=None)
|
|
|
|
|
+ if parsed is not None:
|
|
|
|
|
+ return parsed
|
|
|
|
|
+ try:
|
|
|
|
|
+ return literal_eval(data)
|
|
|
|
|
+ except (ValueError, SyntaxError, TypeError):
|
|
|
|
|
+ return default
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def _dedupe_keep_order(values: List) -> List:
|
|
|
|
|
+ """按原始顺序去重。"""
|
|
|
|
|
+ result = []
|
|
|
|
|
+ for value in values:
|
|
|
|
|
+ if value not in result:
|
|
|
|
|
+ result.append(value)
|
|
|
|
|
+ return result
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def _merge_non_empty_values(*arrays: List) -> List[str]:
|
|
|
|
|
+ """合并多个数组,并过滤 None 与空字符串。"""
|
|
|
|
|
+ result = set()
|
|
|
|
|
+ for array in arrays:
|
|
|
|
|
+ if array is None:
|
|
|
|
|
+ continue
|
|
|
|
|
+ for item in array:
|
|
|
|
|
+ if item is not None and item != "":
|
|
|
|
|
+ result.add(item)
|
|
|
|
|
+ return list(result)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
# ==================== JSON ====================
|
|
# ==================== JSON ====================
|
|
|
|
|
|
|
|
|
|
# UDF-01 JSON validation: report whether the input is a valid JSON string.
@udf(returnType=BooleanType())
def is_json(data) -> bool:
    """Return True when ``json.loads`` accepts ``data``, otherwise False.

    ``TypeError`` and ``ValueError`` from ``json.loads`` are treated as
    "not JSON".
    """
    try:
        json.loads(data)
    except (TypeError, ValueError):
        return False
    else:
        return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# UDF-02 JSON keys: extract the key list of a JSON object.
@udf(returnType=ArrayType(StringType()))
def json_object_keys(json_str: str) -> List[str]:
    """Return the top-level keys of a JSON object string.

    Returns None when ``json_str`` is empty, is not valid JSON, or does not
    decode to a dict.
    """
    if not json_str:
        return None
    parsed = _load_json_or_default(json_str, default=None)
    if isinstance(parsed, dict):
        return list(parsed.keys())
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
def flatten_json(json_str: str, reserve_parent: bool = True) -> str:
|
|
def flatten_json(json_str: str, reserve_parent: bool = True) -> str:
|
|
|
- """展平 json,reserve_parent 控制是否保留父 key"""
|
|
|
|
|
|
|
+ """展平 JSON 字符串,`reserve_parent` 控制是否保留父级 key。"""
|
|
|
|
|
|
|
|
def flatten_json_node(parent, json_element) -> Union[float, int, str, Dict, List]:
|
|
def flatten_json_node(parent, json_element) -> Union[float, int, str, Dict, List]:
|
|
|
if isinstance(json_element, dict):
|
|
if isinstance(json_element, dict):
|
|
@@ -78,13 +119,12 @@ def flatten_json(json_str: str, reserve_parent: bool = True) -> str:
|
|
|
json_node = json.loads(json_str)
|
|
json_node = json.loads(json_str)
|
|
|
flattened_json = flatten_json_node(None, json_node)
|
|
flattened_json = flatten_json_node(None, json_node)
|
|
|
return json.dumps(flattened_json, ensure_ascii=False)
|
|
return json.dumps(flattened_json, ensure_ascii=False)
|
|
|
- except Exception as e:
|
|
|
|
|
- traceback.format_exc(e)
|
|
|
|
|
|
|
+ except (TypeError, ValueError):
|
|
|
return json_str
|
|
return json_str
|
|
|
|
|
|
|
|
|
|
|
|
|
def remove_empty_key(info):
|
|
def remove_empty_key(info):
|
|
|
- """递归删除 json 中 value 为空的 key"""
|
|
|
|
|
|
|
+ """递归删除 JSON 中 value 为空的 key。"""
|
|
|
json_info = json.loads(info)
|
|
json_info = json.loads(info)
|
|
|
|
|
|
|
|
def internal_remove(json_info):
|
|
def internal_remove(json_info):
|
|
@@ -118,20 +158,17 @@ def remove_empty_key(info):
|
|
|
|
|
|
|
|
|
|
|
|
|
def append_to_json_array(json_array_string: str, new_element, remove_duplicate: bool = False) -> str:
    """Append ``new_element`` to the end of a JSON array string.

    Args:
        json_array_string: JSON text expected to decode to a list.
        new_element: Element to append; a falsy element leaves the input
            unchanged.
        remove_duplicate: When exactly ``True``, duplicates are dropped from
            the resulting array, keeping first occurrences.

    Returns:
        The serialized array. If ``json_array_string`` is empty, a
        one-element array is returned. If it does not decode to a list,
        the input string is returned untouched.
    """
    if not new_element:
        return json_array_string
    if not json_array_string:
        elements = [new_element]
    else:
        elements = _load_json_or_default(json_array_string, default=None)
        if not isinstance(elements, list):
            return json_array_string
        elements.append(new_element)
        if remove_duplicate is True:
            elements = _dedupe_keep_order(elements)
    return json.dumps(elements, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
|
|
|
@@ -139,7 +176,7 @@ def json_array_subset(json_array_string: str,
|
|
|
subset_fields: Union[List, str],
|
|
subset_fields: Union[List, str],
|
|
|
as_list: bool = False,
|
|
as_list: bool = False,
|
|
|
skip_null: bool = False) -> str:
|
|
skip_null: bool = False) -> str:
|
|
|
- """按字段提取 json object array 的子集"""
|
|
|
|
|
|
|
+ """按字段提取 JSON object array 的子集。"""
|
|
|
if not json_array_string:
|
|
if not json_array_string:
|
|
|
return None
|
|
return None
|
|
|
if not subset_fields:
|
|
if not subset_fields:
|
|
@@ -150,10 +187,9 @@ def json_array_subset(json_array_string: str,
|
|
|
subset_field_list = subset_fields
|
|
subset_field_list = subset_fields
|
|
|
if len(subset_field_list) == 0:
|
|
if len(subset_field_list) == 0:
|
|
|
return None
|
|
return None
|
|
|
- try:
|
|
|
|
|
- json_array = json.loads(json_array_string)
|
|
|
|
|
- except:
|
|
|
|
|
- json_array = eval(json_array_string)
|
|
|
|
|
|
|
+ json_array = _load_json_or_literal(json_array_string, default=None)
|
|
|
|
|
+ if not isinstance(json_array, list):
|
|
|
|
|
+ return None
|
|
|
list_subset = []
|
|
list_subset = []
|
|
|
if len(subset_field_list) == 1 and as_list:
|
|
if len(subset_field_list) == 1 and as_list:
|
|
|
only_subset_field = subset_field_list[0]
|
|
only_subset_field = subset_field_list[0]
|
|
@@ -174,26 +210,12 @@ def json_array_subset(json_array_string: str,
|
|
|
return json.dumps(list_subset, ensure_ascii=False)
|
|
return json.dumps(list_subset, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
-@udf(returnType=ArrayType(StructType([
|
|
|
|
|
- StructField("idx", IntegerType(), False),
|
|
|
|
|
- StructField("obj", StringType(), False),
|
|
|
|
|
-])))
|
|
|
|
|
-def parse_jsonarr_to_arr(s: str):
|
|
|
|
|
- return [(i + 1, json.dumps(obj)) for i, obj in enumerate(json.loads(s))]
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-@udf(returnType=ArrayType(StructType([
|
|
|
|
|
- StructField("idx", IntegerType(), False),
|
|
|
|
|
- StructField("obj", StringType(), False),
|
|
|
|
|
-])))
|
|
|
|
|
-def parse_jsonarr_to_strarr(s: str):
|
|
|
|
|
- return [(i + 1, obj) for i, obj in enumerate(json.loads(s))]
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
# ==================== ARRAY ====================
|
|
# ==================== ARRAY ====================
|
|
|
|
|
|
|
|
|
|
# UDF-21 array intersection: compute the intersection of two arrays.
@udf(returnType=ArrayType(StringType()))
def array_intersect(arr1, arr2):
    """Return the distinct elements present in both ``arr1`` and ``arr2``.

    Returns None when either input is None, instead of raising
    ``TypeError`` from ``set(None)``. The result is built from a set
    intersection, so element order is not defined.
    """
    if arr1 is None or arr2 is None:
        return None
    return list(set(arr1) & set(arr2))
|
|
|
|
|
|
|
|
|
|
|
|
@@ -201,6 +223,7 @@ def array_append(array: List, new_element,
|
|
|
ignore_null: bool = False,
|
|
ignore_null: bool = False,
|
|
|
remove_duplicate: bool = False,
|
|
remove_duplicate: bool = False,
|
|
|
need_sort: bool = False) -> List:
|
|
need_sort: bool = False) -> List:
|
|
|
|
|
+ """向数组追加元素,可按现有规则控制空值、去重和排序。"""
|
|
|
if not array or len(array) == 0:
|
|
if not array or len(array) == 0:
|
|
|
if new_element or ignore_null is not True:
|
|
if new_element or ignore_null is not True:
|
|
|
return [new_element]
|
|
return [new_element]
|
|
@@ -217,76 +240,28 @@ def array_append(array: List, new_element,
|
|
|
return array
|
|
return array
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# UDF-22 array slice: cut an array by start and end index.
@udf(ArrayType(StringType()))
def array_slice(input_array, start, end):
    """Return ``input_array[start:end]`` using Python slice semantics.

    An empty or None ``input_array`` yields an empty list.
    """
    if not input_array:
        return []
    return input_array[start:end]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# UDF-23 array merge: merge a two-dimensional array, dropping None and empty strings.
@udf(returnType=ArrayType(StringType()))
def merge_list(arr_list: List):
    """Merge the sub-arrays of ``arr_list`` into one list.

    Each sub-array is passed as a separate argument to
    ``_merge_non_empty_values``. A None or empty ``arr_list`` is treated as
    having no sub-arrays.
    """
    nested = arr_list if arr_list else []
    return _merge_non_empty_values(*nested)
|
|
|
|
|
|
|
|
|
|
|
|
|
# ==================== STRING ====================
|
|
# ==================== STRING ====================
|
|
|
|
|
|
|
|
|
|
+# UDF-31 中文检测:判断字符串中是否包含中文字符。
|
|
|
@udf(returnType=BooleanType())
|
|
@udf(returnType=BooleanType())
|
|
|
def has_chinese(datum: str) -> bool:
|
|
def has_chinese(datum: str) -> bool:
|
|
|
|
|
+ """判断字符串中是否包含中文字符。"""
|
|
|
if datum:
|
|
if datum:
|
|
|
pattern = re.compile(u'[\u4e00-\u9fa5]')
|
|
pattern = re.compile(u'[\u4e00-\u9fa5]')
|
|
|
if pattern.search(datum):
|
|
if pattern.search(datum):
|
|
@@ -294,64 +269,66 @@ def has_chinese(datum: str) -> bool:
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# UDF-32 similarity: compute a quick similarity ratio of two strings.
@udf(returnType=FloatType())
def similarity(left: str, right: str) -> float:
    """Return ``difflib.SequenceMatcher.quick_ratio()`` for the two strings.

    Returns None when either input is None; ``SequenceMatcher`` cannot
    process None and would otherwise raise ``TypeError``.
    """
    if left is None or right is None:
        return None
    return difflib.SequenceMatcher(None, left, right).quick_ratio()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# UDF-33 regex extract-all: collect every match of a regular expression.
@udf(returnType=ArrayType(StringType()))
def regexp_extract_all(col: str, ptn: str, g: int = 0):
    """Return group ``g`` of every match of ``ptn`` in ``col``.

    A falsy ``col`` is searched as the empty string.
    """
    pattern = re.compile(ptn)
    subject = col if col else ''
    return [match.group(g) for match in pattern.finditer(subject)]
|
|
|
|
|
|
|
|
|
|
|
|
|
def add_random_number_prefix(datum: str, separator: str, floor: int, ceiling: int) -> str:
    """Prefix ``datum`` with a random integer and ``separator``.

    The integer is drawn with ``random.randint(floor, ceiling)``, so both
    bounds are inclusive.
    """
    prefix = random.randint(floor, ceiling)
    return f'{prefix}{separator}{datum}'
|
|
|
|
|
|
|
|
|
|
|
|
|
def field_merge(delimiter: str, *fields_values):
    """Join the distinct, stripped field values with ``delimiter``.

    Falsy values are skipped. Remaining values are stripped and kept in
    first-seen order. Returns None when no field values are passed at all.
    """
    if not fields_values:
        return None
    merged = []
    for value in fields_values:
        if not value:
            continue
        stripped = value.strip()
        if stripped not in merged:
            merged.append(stripped)
    return delimiter.join(merged)
|
|
|
|
|
|
|
|
|
|
|
|
|
def space2null(text):
    """Return ``text`` unchanged, or None when it is falsy or all whitespace."""
    if not text or text.isspace():
        return None
    return text
|
|
|
|
|
|
|
|
|
|
|
|
|
def merge_ws(text: str):
    """Collapse each run of whitespace in ``text`` to a single space.

    Leading and trailing whitespace is removed as a side effect of
    ``str.split()``. A falsy ``text`` yields None.
    """
    if not text:
        return None
    words = text.split()
    return ' '.join(words)
|
|
|
|
|
|
|
|
|
|
|
|
|
def remove_special_char(text, char):
    """Drop the last character of ``text`` when it ends with ``char``.

    Args:
        text: String to trim; None is returned unchanged.
        char: Trailing character to look for. An empty or None ``char``
            leaves ``text`` unchanged.

    Returns:
        ``text`` without its last character when it ends with ``char``,
        otherwise ``text`` as given.
    """
    # Every string ends with '', so an empty ``char`` must be rejected here
    # or the last character would be removed unconditionally.
    if text is not None and char and text.endswith(char):
        return text[:-1]
    return text
|
|
|
|
|
|
|
|
|
|
|
|
|
-@udf(returnType=ArrayType(StringType()))
|
|
|
|
|
-def explode_str_to_arr(text: str) -> list:
|
|
|
|
|
- """大于 8 位时,从后往前每次少一位截取子串入数组(用于前缀匹配场景)"""
|
|
|
|
|
- if text is None:
|
|
|
|
|
- return []
|
|
|
|
|
- if len(text) <= 8:
|
|
|
|
|
- return [text]
|
|
|
|
|
- return [text[:i] for i in range(len(text), 7, -1)]
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
def html_unescape(text):
    """Convert HTML character references in ``text`` to their characters.

    Returns None when ``text`` is None; ``html.unescape`` itself raises
    ``TypeError`` for None.
    """
    if text is None:
        return None
    return html.unescape(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
# ==================== NUMERIC / DATE / HASH ====================
|
|
# ==================== NUMERIC / DATE / HASH ====================
|
|
|
|
|
|
|
|
def max_value(*args):
|
|
def max_value(*args):
|
|
|
|
|
+ """按现有真假值规则返回最大值。"""
|
|
|
maxi_value = None
|
|
maxi_value = None
|
|
|
for elem in args:
|
|
for elem in args:
|
|
|
if not elem:
|
|
if not elem:
|
|
@@ -362,6 +339,7 @@ def max_value(*args):
|
|
|
|
|
|
|
|
|
|
|
|
|
def min_value(*args):
|
|
def min_value(*args):
|
|
|
|
|
+ """按现有真假值规则返回最小值。"""
|
|
|
mini_value = None
|
|
mini_value = None
|
|
|
for elem in args:
|
|
for elem in args:
|
|
|
if not elem:
|
|
if not elem:
|
|
@@ -372,12 +350,14 @@ def min_value(*args):
|
|
|
|
|
|
|
|
|
|
|
|
|
def millis_timestamp_to_str(ts: int, str_format: str = None) -> str:
    """Format a millisecond timestamp as a date-time string.

    Args:
        ts: Timestamp in milliseconds, converted with
            ``datetime.fromtimestamp`` (no explicit timezone is passed).
        str_format: Optional ``strftime`` pattern. When falsy, the output is
            ``YYYY-MM-DD HH:MM:SS.mmm`` with millisecond precision.
    """
    moment = datetime.fromtimestamp(ts / 1000.0)
    if not str_format:
        # %f yields microseconds (6 digits); trim to milliseconds.
        return moment.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
    return moment.strftime(str_format)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+# UDF-41 时间解析:把日期字符串解析为时间戳。
|
|
|
@udf(returnType=LongType())
|
|
@udf(returnType=LongType())
|
|
|
def parse_datetime_to_timestamp(date_time: str, in_milli_seconds: bool = False, original_format: str = None) -> int:
|
|
def parse_datetime_to_timestamp(date_time: str, in_milli_seconds: bool = False, original_format: str = None) -> int:
|
|
|
"""字符串日期 → 时间戳;支持 YY.MM.DD / YYYY年M月D日 启发式识别"""
|
|
"""字符串日期 → 时间戳;支持 YY.MM.DD / YYYY年M月D日 启发式识别"""
|
|
@@ -406,6 +386,7 @@ def parse_datetime_to_timestamp(date_time: str, in_milli_seconds: bool = False,
|
|
|
return None
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+# UDF-42 MD5摘要:把多列值按长度前缀拼接后计算 MD5。
|
|
|
@udf(returnType=StringType())
|
|
@udf(returnType=StringType())
|
|
|
def get_md5(*cols: str) -> str:
|
|
def get_md5(*cols: str) -> str:
|
|
|
"""多列拼接(带长度前缀防碰撞)后取 md5"""
|
|
"""多列拼接(带长度前缀防碰撞)后取 md5"""
|
|
@@ -425,31 +406,39 @@ def get_md5(*cols: str) -> str:
|
|
|
# ==================== CROSS-TYPE CONVERTERS ====================
|
|
# ==================== CROSS-TYPE CONVERTERS ====================
|
|
|
|
|
|
|
|
def array_to_json(arr: List):
    """Serialize ``arr`` to a JSON string, keeping non-ASCII characters as-is."""
    serialized = json.dumps(arr, ensure_ascii=False)
    return serialized
|
|
|
|
|
|
|
|
|
|
|
|
|
def map_to_json(map: dict):
    """Serialize ``map`` to a JSON string, keeping non-ASCII characters as-is."""
    serialized = json.dumps(map, ensure_ascii=False)
    return serialized
|
|
|
|
|
|
|
|
|
|
|
|
|
def struct_to_json(struct):
    """Serialize a struct-like object to a JSON object string.

    Field names are read from ``struct.__dict__["__fields__"]`` and each
    value is fetched with ``struct[name]``.
    NOTE(review): presumably ``struct`` is a pyspark ``Row`` - confirm.
    """
    field_names = struct.__dict__["__fields__"]
    payload = {}
    for name in field_names:
        payload[name] = struct[name]
    return json.dumps(payload, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
def num_to_str(number):
    """Convert a number to a string, dropping ``.0`` from integral floats.

    Ints (including bools) are rendered via ``int``. Floats holding an
    integral value are rendered without a fractional part. Anything else
    goes through ``str``.
    """
    if isinstance(number, int):
        return str(int(number))
    if isinstance(number, float) and number.is_integer():
        return '{:.0f}'.format(number)
    return str(number)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# UDF-51 string to array: convert a JSON array string to a Python list.
@udf(returnType=ArrayType(StringType()))
def str_to_arr(json_str: str) -> list:
    """Decode a JSON array string into a list.

    Returns an empty list when ``json_str`` is falsy, is not valid JSON, or
    decodes to something other than a list.
    """
    if not json_str:
        return []
    parsed = _load_json_or_default(json_str, default=[])
    if isinstance(parsed, list):
        return parsed
    return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+# UDF-52 字符串转JSON字符串数组:把 JSON array 转为 JSON 字符串数组。
|
|
|
@udf(returnType=ArrayType(StringType()))
|
|
@udf(returnType=ArrayType(StringType()))
|
|
|
def str_to_json_arr(json_str):
|
|
def str_to_json_arr(json_str):
|
|
|
"""JSON array 字符串 → list of json strings(每个元素再 json.dumps)"""
|
|
"""JSON array 字符串 → list of json strings(每个元素再 json.dumps)"""
|
|
@@ -463,33 +452,11 @@ def str_to_json_arr(json_str):
|
|
|
return []
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# UDF-53 string to map array: convert a JSON array string to an array of maps.
@udf(returnType=ArrayType(MapType(StringType(), StringType())))
def str_to_map_arr(json_str: str) -> list:
    """Decode a JSON array string into a list.

    Returns an empty list when ``json_str`` is falsy, is not valid JSON, or
    decodes to something other than a list.
    """
    if not json_str:
        return []
    parsed = _load_json_or_default(json_str, default=[])
    if isinstance(parsed, list):
        return parsed
    return []
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-@udf(returnType=StringType())
|
|
|
|
|
-def split_str_to_jsonstr(str_list: List):
|
|
|
|
|
- """每个元素按 ':' 切成 k:v,聚合成 JSON 字符串"""
|
|
|
|
|
- res = []
|
|
|
|
|
- for kv_str in str_list:
|
|
|
|
|
- arr = kv_str.split(':')
|
|
|
|
|
- if len(arr) == 2:
|
|
|
|
|
- res.append({arr[0]: arr[1]})
|
|
|
|
|
- return json.dumps(res, ensure_ascii=False)
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-@udf(returnType=MapType(StringType(), ArrayType(StringType())))
|
|
|
|
|
-def split_str_to_maparr(str_list: List):
|
|
|
|
|
- """每个元素按 ':' 切成 k:v,同 key 追加到 list"""
|
|
|
|
|
- res = {}
|
|
|
|
|
- for kv_str in str_list:
|
|
|
|
|
- arr = kv_str.split(':')
|
|
|
|
|
- if len(arr) == 2:
|
|
|
|
|
- if arr[0] not in res:
|
|
|
|
|
- res[arr[0]] = [arr[1]]
|
|
|
|
|
- else:
|
|
|
|
|
- res[arr[0]].append(arr[1])
|
|
|
|
|
- return res
|
|
|