# clean_crawler_data_test.py

from typing import Set

import pytest

from dw_base.spark.udf.customs.clean_crawler_data import get_regex_match, clean_germany_company_name
  4. @pytest.mark.parametrize("company_name, expected", [
  5. ('Sécheron SA', {'é'}),
  6. ('Sécheron SA\\u0022ss',{'é','\\u0022'}),
  7. ('GEHS GRÜN ENERGİE HEIZUNG UND SANİTÂR',{'Ü','Â'})
  8. ])
  9. def test_get_regex_match(company_name: str, expected: Set[str]):
  10. result = get_regex_match(company_name)
  11. assert result == expected
  12. @pytest.mark.parametrize("company_name, expected", [
  13. ('Beiersdorf Indústria Comércio', 'Beiersdorf Indústria Comércio'),
  14. ('GPS Prüftechnik Rhein/Main GmbH', 'GPS Prüftechnik Rhein/Main GmbH')
  15. ])
  16. def test_clean_germany_company_name(company_name: str, expected: str):
  17. result = clean_germany_company_name(company_name)
  18. assert result == expected