"""Normalise loosely formatted Indonesian/English date strings to ``YYYY-MM-DD``.

The input strings are expected to be messy (missing separators, stray
punctuation, misspelt month names, split or shortened years).  ``get_date``
extracts the raw year/month/day tokens with a cascade of regular expressions,
and the ``clean_*`` helpers validate and normalise each token.
"""
import datetime
import re
from typing import Optional, Tuple

# day, month, year with optional single separator between day and month,
# e.g. "28 Agustus' 09", "19Juli 2011", "21 Okober-20 10".
pattern1 = r'(\d+)[- /\']?([A-Za-z\d]+)[- /\'\.]+(\d+ ?\d+)'
# Same as pattern1 with one optional leading junk character, e.g.
# ",30 April 2013" or "'06-Sept-10".  NOTE: ``,-;`` inside the class is a
# character *range* (',' through ';'), so the prefix also accepts '-', '.',
# '/', ':' and digits.  Kept as-is to preserve what currently matches.
pattern2 = r'[,-;\']?(\d+)[- /\']?([A-Za-z\d]+)[- /\'\.]+(\d+ ?\d+)'
# "Weekday, Month DD, YYYY", e.g. "Monday, November 03, 2014".
pattern3 = r'[A-Za-z]+, ([A-Za-z]+) (\d+), (\d+)'
# Space separated tokens where the day may be a letter, e.g.
# "I Februari 2011" or "8 oct 9 ".
pattern4 = r'([A-Za-z\d]+) ([A-Za-z\d]+)\.? (\d+)'
# Day (or "!") followed by a month name glued to the year, e.g.
# "! Dec 09", "1 Desember2009", "22-Jan013".
pattern5 = r'(!|\d+)[- ]+([A-Za-z]+)[ ]?(\d+)'

# Month spellings (including observed typos) mapped to two-digit month numbers.
month_dict = {
    'Agsts': '08',
    'Agsutus': '08',
    'Agts': '08',
    'Agust': '08',
    'Agustus': '08',
    'Apr': '04',
    'April': '04',
    'Aprl': '04',
    'Aprll': '04',
    'Aug': '08',
    'August': '08',
    'Deaember': '12',
    'Dec': '12',
    'December': '12',
    'Des': '12',
    'Desember': '12',
    'Feb': '02',
    'Febrauri': '02',
    'Februari': '02',
    'Februaru': '02',
    'February': '02',
    'Febuari': '02',
    'JULI': '07',
    'Jan': '01',
    'Januari': '01',
    'January': '01',
    'Jul': '07',
    'Juli': '07',
    'July': '07',
    'Jun': '06',
    'June': '06',
    'Juni': '06',
    'MAy': '05',
    'Mar': '03',
    'March': '03',
    'Maret': '03',
    'Mart': '03',
    'May': '05',
    'Mei': '05',
    'Mrt': '03',
    'No': '11',
    'Nof': '11',
    'Nop': '11',
    'Nopember': '11',
    'Nov': '11',
    'November': '11',
    'Oct': '10',
    'October': '10',
    'Okober': '10',
    'Okt': '10',
    'Okt0ber': '10',
    'Oktober': '10',
    'Pebruari': '02',
    'Sep': '09',
    'Sepetember': '09',
    'Sept': '09',
    'September': '09',
    'Septembver': '09',
    'agust': '08',
    'des': '12',
    'desmb': '12',
    'juli': '07',
    'maret': '03',
    'mei': '05',
    'november': '11',
    'oct': '10',
    'oktober': '10',
}

# Case-insensitive view of month_dict, used only when the exact spelling
# is not found (so "AGUSTUS" resolves like "Agustus").
_MONTHS_CASEFOLDED = {name.casefold(): number for name, number in month_dict.items()}

# Patterns in priority order; the flag says whether group 1 is the day
# (True) or the month (False).  The year is always the last group.
_DATE_PATTERNS = (
    (re.compile(pattern1), True),
    (re.compile(pattern2), True),
    (re.compile(pattern3), False),
    (re.compile(pattern4), True),
    (re.compile(pattern5), True),
)

# Accepted year window: _MIN_YEAR <= year <= max_year.
_MIN_YEAR = 1901
_MAX_YEAR = 2024


def get_date(text: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Extract the raw ``(year, month, day)`` tokens from *text*.

    The patterns are tried in order and anchored at the start of *text*;
    the first one that matches wins.  Tokens are returned exactly as they
    appear in the input (no validation).  Returns ``(None, None, None)``
    when nothing matches.
    """
    for regex, day_first in _DATE_PATTERNS:
        found = regex.match(text)
        if found is None:
            continue
        first, second, year = found.groups()
        day, month = (first, second) if day_first else (second, first)
        return year, month, day
    return None, None, None


def clean_date_indonesia(text, max_year: int = _MAX_YEAR) -> Optional[str]:
    """Return *text* as an ISO ``YYYY-MM-DD`` string, or ``None``.

    ``None`` is returned when *text* is empty, when no pattern matches, when
    any component fails validation, or when the components do not form a
    real calendar date (e.g. 31 February).

    *max_year* is the latest year accepted (see ``clean_year``).
    """
    if not text:
        return None
    raw_year, raw_month, raw_day = get_date(text)
    year = clean_year(raw_year, max_year)
    month = clean_month(raw_month)
    day = clean_day(raw_day)
    if not (year and month and day):
        return None
    try:
        # Reject impossible combinations such as 2011-02-31.
        datetime.date(int(year), int(month), int(day))
    except ValueError:
        return None
    return f'{year}-{month}-{day}'


def clean_year(year: str, max_year: int = _MAX_YEAR) -> Optional[str]:
    """Normalise a raw year token to a four-digit string, or ``None``.

    Embedded spaces are removed first ("20 10" -> "2010").  Short forms are
    expanded: one digit ``d`` -> ``200d``; two digits below 30 -> ``20yy``,
    otherwise ``19yy``; three digits starting with ``0`` -> ``2`` + token,
    otherwise a ``0`` is inserted after the first digit ("204" -> "2004").

    The result must lie between 1901 and *max_year* inclusive; anything
    else, including non-digit input or more than four digits, gives ``None``.
    The range check applies to expanded short forms too, so "27" is rejected
    with the default *max_year* just as "2027" is.
    """
    if not year:
        return None
    digits = year.replace(' ', '')
    if not re.fullmatch(r'[0-9]{1,4}', digits):
        return None

    if len(digits) == 1:
        expanded = f'200{digits}'
    elif len(digits) == 2:
        century = '20' if int(digits) < 30 else '19'
        expanded = f'{century}{digits}'
    elif len(digits) == 3:
        if digits[0] == '0':
            expanded = f'2{digits}'
        else:
            expanded = f'{digits[0]}0{digits[1:]}'
    else:
        expanded = digits

    if _MIN_YEAR <= int(expanded) <= max_year:
        return expanded
    return None


def clean_month(month: str) -> Optional[str]:
    """Normalise a raw month token to a two-digit string, or ``None``.

    One- or two-digit numbers are zero padded.  Anything else is looked up
    in ``month_dict``, first by exact spelling and then case-insensitively.
    Unknown names and numbers outside 1..12 give ``None``.
    """
    if not month:
        return None
    if re.fullmatch(r'[0-9]{1,2}', month):
        candidate = month.zfill(2)
    else:
        candidate = month_dict.get(month)
        if candidate is None:
            candidate = _MONTHS_CASEFOLDED.get(month.casefold())
    try:
        number = int(candidate)
    except (TypeError, ValueError):
        # TypeError: the name was not found (candidate is None).
        return None
    return candidate if 1 <= number <= 12 else None


def clean_day(day: str) -> Optional[str]:
    """Normalise a raw day token to a two-digit string, or ``None``.

    ``'!'`` and ``'I'`` are treated as misread ``1``.  Otherwise the token
    must be one or two digits with a value in 1..31.
    """
    if not day:
        return None
    if day in ('!', 'I'):
        return '01'
    if not re.fullmatch(r'[0-9]{1,2}', day):
        return None
    return day.zfill(2) if 1 <= int(day) <= 31 else None


def _demo() -> None:
    """Print the parsing and cleaning results for a set of sample inputs."""
    test_cases = [
        'Monday, November 03, 2014',
        'Tuesday, September 08, 2015',
        "28 Agustus' 09",
        "19Juli 2011",
        '25September 2027',
        'I Februari 2011',
        ',30 April 2013',
        '25 Agsts. 08',
        "4'Nov 08",
        "15 Des.08",
        "'06-Sept-10",
        "! Dec 09",
        "06- Mei 09",
        "1 Desember2009",
        "18 MAy09",
        "22-Jan013",
        "21 Okober-20 10",
        "01 Oktober 2 013",
        "19-No-13",
        "8 oct 9 ",
    ]
    for test_case in test_cases:
        print(test_case + ' -> ', get_date(test_case))

    year_cases = [
        '00', '01', '09', '19', '20', '79', '85', '96', '99', '013', '204',
        '209', '210', '2 013', '20 10', '1028', '2116', '10209', '13',
    ]
    for year_case in year_cases:
        print(year_case + ' -> ' + str(clean_year(year_case)))

    month_cases = [
        '01', '09', '11', '5', '7', 'Agsts', 'January', 'Okt', 'No', '17'
    ]
    for month_case in month_cases:
        print(month_case + ' -> ' + str(clean_month(month_case)))

    day_cases = [
        '01', '09', '11', '5', '7', '!', 'I', '31', '35', '13', '898']
    for day_case in day_cases:
        print(day_case + ' -> ' + str(clean_day(day_case)))

    print('----------------------------------------------------------------|')
    for test_case in test_cases:
        print(test_case + ' -> ', clean_date_indonesia(test_case))


if __name__ == '__main__':
    _demo()