import json from datetime import datetime from dw_base.udf.common import spark_common_udf as udf_module def test_json_object_keys_returns_keys_for_json_object(): assert udf_module.json_object_keys.func('{"a": 1, "b": 2}') == ["a", "b"] def test_json_array_subset_supports_python_literal_without_eval(): data = "[{'name': 'alice', 'age': 18}, {'name': 'bob', 'age': 20}]" result = udf_module.json_array_subset(data, "name", as_list=True) assert json.loads(result) == ["alice", "bob"] def test_json_array_subset_returns_none_for_invalid_input(): assert udf_module.json_array_subset("not-json", "name") is None def test_append_to_json_array_returns_original_when_source_is_invalid_json(): assert udf_module.append_to_json_array("not-json", "x") == "not-json" def test_append_to_json_array_can_remove_duplicates(): result = udf_module.append_to_json_array('["a", "b"]', "a", remove_duplicate=True) assert json.loads(result) == ["a", "b"] def test_flatten_json_returns_original_text_for_invalid_json(): assert udf_module.flatten_json("not-json") == "not-json" def test_remove_empty_key_removes_empty_values_recursively(): source = json.dumps({ "a": "", "b": None, "c": {"d": "", "e": 1}, "f": ["", {"g": "x"}], }) assert json.loads(udf_module.remove_empty_key(source)) == {"c": {"e": "1"}, "f": [{"g": "x"}]} def test_merge_list_keeps_existing_semantics(): merged_list = sorted(udf_module.merge_list.func([["a", "", None], ["b", "a"], None])) assert merged_list == ["a", "b"] def test_array_intersect_returns_common_items(): assert sorted(udf_module.array_intersect.func(["a", "b"], ["b", "c"])) == ["b"] def test_array_append_respects_existing_semantics(): assert udf_module.array_append(["a"], "a", remove_duplicate=True) == ["a"] assert udf_module.array_append(["b"], "a", need_sort=True) == ["a", "b"] def test_array_slice_returns_sub_list(): assert udf_module.array_slice.func(["a", "b", "c"], 1, 3) == ["b", "c"] def test_has_chinese_detects_chinese_characters(): assert udf_module.has_chinese.func("abc中文") is True assert udf_module.has_chinese.func("abc") is False def test_similarity_returns_high_score_for_identical_strings(): assert udf_module.similarity.func("abc", "abc") == 1.0 def test_regexp_extract_all_extracts_all_matches(): assert udf_module.regexp_extract_all.func("a1b22c333", r"\d+") == ["1", "22", "333"] def test_field_merge_deduplicates_values(): assert udf_module.field_merge(",", " a ", "b", "a", None) == "a,b" def test_space2null_and_merge_ws_and_remove_special_char(): assert udf_module.space2null(" ") is None assert udf_module.space2null(" a ") == " a " assert udf_module.merge_ws("a b\tc") == "a b c" assert udf_module.remove_special_char("abc,", ",") == "abc" def test_html_unescape_restores_html_entities(): assert udf_module.html_unescape("<div>Tom & Jerry</div>") == "