d2str.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259
  1. import re
  2. pattern1 = r'(\d+)[- /\']?([A-Za-z\d]+)[- /\'\.]+(\d+ ?\d+)'
  3. pattern2 = r'[,-;\']?(\d+)[- /\']?([A-Za-z\d]+)[- /\'\.]+(\d+ ?\d+)'
  4. pattern3 = r'[A-Za-z]+, ([A-Za-z]+) (\d+), (\d+)'
  5. pattern4 = r'([A-Za-z\d]+) ([A-Za-z\d]+)\.? (\d+)'
  6. pattern5 = r'(!|\d+)[- ]+([A-Za-z]+)[ ]?(\d+)'
  7. month_dict = {'Agsts': '08',
  8. 'Agsutus': '08',
  9. 'Agts': '08',
  10. 'Agust': '08',
  11. 'Agustus': '08',
  12. 'Apr': '04',
  13. 'April': '04',
  14. 'Aprl': '04',
  15. 'Aprll': '04',
  16. 'Aug': '08',
  17. 'August': '08',
  18. 'Deaember': '12',
  19. 'Dec': '12',
  20. 'December': '12',
  21. 'Des': '12',
  22. 'Desember': '12',
  23. 'Feb': '02',
  24. 'Febrauri': '02',
  25. 'Februari': '02',
  26. 'Februaru': '02',
  27. 'February': '02',
  28. 'Febuari': '02',
  29. 'JULI': '07',
  30. 'Jan': '01',
  31. 'Januari': '01',
  32. 'January': '01',
  33. 'Jul': '07',
  34. 'Juli': '07',
  35. 'July': '07',
  36. 'Jun': '06',
  37. 'June': '06',
  38. 'Juni': '06',
  39. 'MAy': '05',
  40. 'Mar': '03',
  41. 'March': '03',
  42. 'Maret': '03',
  43. 'Mart': '03',
  44. 'May': '05',
  45. 'Mei': '05',
  46. 'Mrt': '03',
  47. 'No': '11',
  48. 'Nof': '11',
  49. 'Nop': '11',
  50. 'Nopember': '11',
  51. 'Nov': '11',
  52. 'November': '11',
  53. 'Oct': '10',
  54. 'October': '10',
  55. 'Okober': '10',
  56. 'Okt': '10',
  57. 'Okt0ber': '10',
  58. 'Oktober': '10',
  59. 'Pebruari': '02',
  60. 'Sep': '09',
  61. 'Sepetember': '09',
  62. 'Sept': '09',
  63. 'September': '09',
  64. 'Septembver': '09',
  65. 'agust': '08',
  66. 'des': '12',
  67. 'desmb': '12',
  68. 'juli': '07',
  69. 'maret': '03',
  70. 'mei': '05',
  71. 'november': '11',
  72. 'oct': '10',
  73. 'oktober': '10'
  74. }
  75. def get_date(text: str):
  76. match1 = re.match(pattern1, text)
  77. if match1:
  78. day, month, year = match1.groups()
  79. return year, month, day
  80. match2 = re.match(pattern2, text)
  81. if match2:
  82. day, month, year = match2.groups()
  83. return year, month, day
  84. match3 = re.match(pattern3, text)
  85. if match3:
  86. month, day, year = match3.groups()
  87. return year, month, day
  88. match4 = re.match(pattern4, text)
  89. if match4:
  90. day, month, year = match4.groups()
  91. return year, month, day
  92. match5 = re.match(pattern5, text)
  93. if match5:
  94. day, month, year = match5.groups()
  95. return year, month, day
  96. return None, None, None
  97. def clean_date_indonesia(text):
  98. if text:
  99. year, month, day = get_date(text)
  100. year = clean_year(year)
  101. month = clean_month(month)
  102. day = clean_day(day)
  103. if year and month and day:
  104. return f'{year}-{month}-{day}'
  105. else:
  106. return None
  107. def clean_year(year: str):
  108. if year:
  109. year = year.replace(' ', '')
  110. if len(year) == 1:
  111. return f'200{year}'
  112. elif len(year) == 2:
  113. if year < '30':
  114. return f'20{year}'
  115. else:
  116. return f'19{year}'
  117. elif len(year) == 3:
  118. if year[0] == '0':
  119. return f'2{year}'
  120. else:
  121. return f'{year[0]}0{year[1:]}'
  122. try:
  123. year_int = int(year)
  124. if year_int > 2024 or year_int <= 1900:
  125. return None
  126. except ValueError:
  127. return None
  128. return year
  129. else:
  130. return None
  131. def clean_month(month: str):
  132. if month:
  133. if len(month) == 1:
  134. month = f'0{month}'
  135. elif re.match( r'^\d{2}$', month):
  136. month = month
  137. else:
  138. month = month_dict.get(month)
  139. try:
  140. month_int = int(month)
  141. if month_int < 1 or month_int > 12:
  142. return None
  143. except ValueError:
  144. return None
  145. return month
  146. else:
  147. return None
  148. def clean_day(day: str):
  149. if day:
  150. if len(day) == 1:
  151. if day in ['!', 'I', '1']:
  152. return '01'
  153. else:
  154. return f'0{day}'
  155. elif len(day) == 2:
  156. try:
  157. day_int = int(day)
  158. if day_int < 1 or day_int > 31:
  159. return None
  160. except ValueError:
  161. return None
  162. return day
  163. else:
  164. return None
  165. if __name__ == '__main__':
  166. test_cases = [
  167. 'Monday, November 03, 2014',
  168. 'Tuesday, September 08, 2015',
  169. "28 Agustus' 09",
  170. "19Juli 2011",
  171. '25September 2027',
  172. 'I Februari 2011',
  173. ',30 April 2013',
  174. '25 Agsts. 08',
  175. "4'Nov 08",
  176. "15 Des.08",
  177. "'06-Sept-10",
  178. "! Dec 09",
  179. "06- Mei 09",
  180. "1 Desember2009",
  181. "18 MAy09",
  182. "22-Jan013",
  183. "21 Okober-20 10",
  184. "01 Oktober 2 013",
  185. "19-No-13",
  186. "8 oct 9 ",
  187. ]
  188. for test_case in test_cases:
  189. print(test_case + ' -> ', get_date(test_case))
  190. year_cases = [
  191. '00',
  192. '01',
  193. '09',
  194. '19',
  195. '20',
  196. '79',
  197. '85',
  198. '96',
  199. '99',
  200. '013',
  201. '204',
  202. '209',
  203. '210',
  204. '2 013',
  205. '20 10',
  206. '1028',
  207. '2116',
  208. '10209',
  209. '13',
  210. ]
  211. for year_case in year_cases:
  212. print(year_case + ' -> ' + str(clean_year(year_case)))
  213. month_cases = [
  214. '01',
  215. '09',
  216. '11',
  217. '5',
  218. '7',
  219. 'Agsts',
  220. 'January',
  221. 'Okt',
  222. 'No',
  223. '17'
  224. ]
  225. for month_case in month_cases:
  226. print(month_case + ' -> ' + str(clean_month(month_case)))
  227. day_cases = [
  228. '01',
  229. '09',
  230. '11',
  231. '5',
  232. '7',
  233. '!',
  234. 'I',
  235. '31',
  236. '35',
  237. '13',
  238. '898']
  239. for day_case in day_cases:
  240. print(day_case + ' -> ' + str(clean_day(day_case)))
  241. print('----------------------------------------------------------------|')
  242. for test_case in test_cases:
  243. print(test_case + ' -> ', clean_date_indonesia(test_case))