# ent_clean_text_test.py: pytest cases for dw_base.spark.udf.enterprise.ent_clean_text
import pytest

from dw_base.spark.udf.enterprise.ent_clean_text import *
  3. @pytest.mark.parametrize("country, url, expected", [
  4. ('China', 'https://www.ianshaw.biz/p/contact-management.php', 'ianshaw.biz/p/contact-management.php')
  5. ])
  6. def test_clean_url(country, url, expected):
  7. result = clean_url(country, url)
  8. assert result == expected
  9. @pytest.mark.parametrize("url, expected", [
  10. ('https://charnleyfertilisers.co.uk/', 'charnleyfertilisers.co.uk')
  11. ])
  12. def test_clean_url_common(url, expected):
  13. result = clean_url_common(url)
  14. assert result == expected
  15. @pytest.mark.parametrize("str, expected", [
  16. ('13.02.2024', '2024-02-13'),
  17. ('13/02/2024', '2024-02-13'),
  18. ('', None),
  19. (None, None)
  20. ])
  21. def test_reverse_str(str, expected):
  22. result = reverse_str(str)
  23. assert result == expected
  24. @pytest.mark.parametrize("str, expected", [
  25. ('13a02b20 24', '13022024'),
  26. ])
  27. def test_replace_english_and_space(str, expected):
  28. result = replace_english_and_space(str)
  29. assert result == expected
  30. @pytest.mark.parametrize("str, expected", [
  31. ('981617611,981617611,981617611', '981617611'),
  32. ('', None),
  33. (None, None)
  34. ])
  35. def test_array_remove_duplicates(str, expected):
  36. result = array_remove_duplicates(str)
  37. assert result == expected
  38. @pytest.mark.parametrize("str, expected", [
  39. ('12.575.462 ALVARO PEREIRA DA SILVEIRA FILHO', 'ALVARO PEREIRA DA SILVEIRA FILHO'),
  40. ('HEBE DE ABREU VILELA CPF 027116806149', 'HEBE DE ABREU VILELA CPF'),
  41. ('HEBE DE ABREU VILELA CPF 027116', 'HEBE DE ABREU VILELA CPF 027116'),
  42. ('', None),
  43. (None, None)
  44. ])
  45. def test_clean_brazil_company_name(str, expected):
  46. result = clean_brazil_company_name(str)
  47. assert result == expected
  48. @pytest.mark.parametrize("str, expected", [
  49. ('2124859522,2124859523', '2124859522,2124859523'),
  50. ('123456,2124859523', '2124859523'),
  51. ('123456789,12345678', None),
  52. ('', None),
  53. (None, None)
  54. ])
  55. def test_phone_clean_turkey(str, expected):
  56. result = phone_clean_turkey(str)
  57. assert result == expected
  58. @pytest.mark.parametrize("str, expected", [
  59. ('91234567891', None),
  60. ('01234567891', '1234567891'),
  61. ('123456789012', None),
  62. ('21', None),
  63. ('', None),
  64. (None, None)
  65. ])
  66. def test_fax_clean_turkey(str, expected):
  67. result = fax_clean_turkey(str)
  68. assert result == expected
  69. @pytest.mark.parametrize("str, expected", [
  70. ('["10.71.01-Taze pastane ürünleri imalatı (yaş pasta, kuru pasta, poğaça, kek, börek, pay, turta, waffles vb.)"]', '107101'),
  71. ('["35.12.13-Elektrik enerjisinin iletimi (elektrik üretim kaynağından dağıtım sistemine aktaran iletim sistemlerinin işletilmesi)","42.22.02-Enerji santralleri inşaatı (hidroelektrik santrali, termik santral, nükleer enerji üretim santralleri vb.)","35.11.19-Elektrik enerjisi üretimi"]', '351213, 422202, 351119'),
  72. ('', None),
  73. (None, None)
  74. ])
  75. def test_turkey_nicecode(str, expected):
  76. result = turkey_nicecode(str)
  77. assert result == expected
  78. @pytest.mark.parametrize("str, expected", [
  79. ('HUGO.SANSIL@GMAIL.COM E HUGO@SISTEMAFIEG.ORG.BR', ['hugo.sansil@gmail.com', 'hugo@sistemafieg.org.br']),
  80. ('SANDRA_MMC@BOL.COM.BR NAIRMOTADIAS@HOTMAIL.COM', ['sandra_mmc@bol.com.br', 'nairmotadias@hotmail.com']),
  81. ('', None),
  82. (None, None)
  83. ])
  84. def test_email_clean_brazil(str, expected):
  85. result = email_clean_brazil(str)
  86. assert result == expected
  87. @pytest.mark.parametrize("str, expected", [
  88. ('["medical practice","medical practices","hospital & health care"]', 'medical practice,medical practices,hospital & health care'),
  89. ('["construction"]', 'construction'),
  90. ('', None),
  91. (None, None)
  92. ])
  93. def test_arr_str_to_str(str, expected):
  94. result = arr_str_to_str(str)
  95. assert result == expected
  96. @pytest.mark.parametrize("str, expected", [
  97. ('+1-866-344-7857 ext. 311', '+1 866 344 7857 311'),
  98. ('(844)800-BULL', None),
  99. ('', None),
  100. (None, None)
  101. ])
  102. def test_clean_tel_apollo(str, expected):
  103. result = clean_tel_apollo(str)
  104. assert result == expected
  105. @pytest.mark.parametrize("socialtype, url, expected", [
  106. ("youtube", "https://youtube.com/user/BrotherCanadaEn", 'user/brothercanadaen'),
  107. ("whatsapp", "919822025525", '919822025525'),
  108. ("twitter", "https://twitter.com/#", ''),
  109. ("linkedin", "https://www.linkedin.com/in/meb-jsc/#", 'in/meb-jsc'),
  110. ("instagram", "https://www.instagram.com/##############/", ''),
  111. ("facebook", "https://www.facebook.com/https://www.facebook.com/komlider38/", 'komlider38'),
  112. ("pinterest", "https://www.pinterest.com/lampstore/https://www.pinterest.com/lampstore/", 'lampstore'),
  113. (None, "",None),
  114. ("", None,None),
  115. (None, None,None)
  116. ])
  117. def test_socialmedia_url(socialtype, url, expected):
  118. result = socialmedia_url(socialtype, url)
  119. assert result == expected
  120. @pytest.mark.parametrize("str, expected", [
  121. ('-- PACIFIC PRODUCTS LIMITED AUSTRALIAN PRODUCTS LIMITED', 'PACIFIC PRODUCTS LIMITED AUSTRALIAN PRODUCTS LIMITED'),
  122. ('03-MAY-2013 Fuente Union Import And Export Limited 福恩特聯合進出口有限公司', 'Fuente Union Import And Export Limited 福恩特聯合進出口有限公司'),
  123. ('', None),
  124. (None, None)
  125. ])
  126. def test_hongkong_previous_name_clean(str, expected):
  127. result = hongkong_previous_name_clean(str)
  128. assert result == expected
  129. @pytest.mark.parametrize("str, expected", [
  130. ('["ownership-of-shares-25-to-50-percent","voting-rights-25-to-50-percent"]', '25-to-50'),
  131. ('["ownership-of-shares-more-than-25-percent-registered-overseas-entity"]', 'more-than-25'),
  132. ('', None),
  133. (None, None)
  134. ])
  135. def test_uk_sharepercent(str, expected):
  136. result = uk_sharepercent(str)
  137. assert result == expected
  138. if __name__ == '__main__':
  139. pytest.main()