from typing import Set

import pytest

from dw_base.spark.udf.customs.clean_crawler_data import (
    clean_germany_company_name,
    get_regex_match,
)


@pytest.mark.parametrize(
    ("company_name", "expected"),
    [
        ("Sécheron SA", {"é"}),
        # The input carries the escape text backslash + "u0022"; the expected
        # set lists that same escape text as one of the matches.
        ("Sécheron SA\\u0022ss", {"é", "\\u0022"}),
        # The input also contains "İ", but only "Ü" and "Â" are expected back.
        ("GEHS GRÜN ENERGİE HEIZUNG UND SANİTÂR", {"Ü", "Â"}),
    ],
)
def test_get_regex_match(company_name: str, expected: Set[str]):
    """Check that ``get_regex_match`` returns the expected set for each name.

    Each case pairs a raw company name with the exact set of substrings the
    helper is expected to return for it.
    """
    matched = get_regex_match(company_name)
    assert matched == expected


@pytest.mark.parametrize(
    ("company_name", "expected"),
    [
        # Both cases expect the name to come back unchanged.
        ("Beiersdorf Indústria Comércio", "Beiersdorf Indústria Comércio"),
        ("GPS Prüftechnik Rhein/Main GmbH", "GPS Prüftechnik Rhein/Main GmbH"),
    ],
)
def test_clean_germany_company_name(company_name: str, expected: str):
    """Check that ``clean_germany_company_name`` yields the expected string."""
    cleaned = clean_germany_company_name(company_name)
    assert cleaned == expected