| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259 |
- import re
# Regular expressions tried in this order by get_date().  They are used with
# re.match(), so each one is anchored at the start of the text only.

# day, optional separator, month token (letters/digits), separator(s), year.
# The year needs at least two digits and may contain one embedded space.
# Examples: "19Juli 2011", "15 Des.08", "21 Okober-20 10".
pattern1 = r'(\d+)[- /\']?([A-Za-z\d]+)[- /\'\.]+(\d+ ?\d+)'

# Same as pattern1, but tolerates one stray leading punctuation mark.
# Examples: ",30 April 2013", "'06-Sept-10".
# The punctuation is spelled out explicitly.  It used to be written as the
# range ",-;", which also covers the digits 0-9 and let the optional prefix
# swallow the first digit of the day ("30 April" -> day "0").
pattern2 = r'[,\-./:;\']?(\d+)[- /\']?([A-Za-z\d]+)[- /\'\.]+(\d+ ?\d+)'

# "<word>, <month> <day>, <year>"; groups are (month, day, year).
# Example: "Monday, November 03, 2014".
pattern3 = r'[A-Za-z]+, ([A-Za-z]+) (\d+), (\d+)'

# Two space-separated alphanumeric tokens (day, month), an optional ".",
# a space, then the year.  Examples: "I Februari 2011", "8 oct 9 ".
pattern4 = r'([A-Za-z\d]+) ([A-Za-z\d]+)\.? (\d+)'

# "!" or digits as the day, hyphens/spaces, an alphabetic month, at most one
# space, then the year.  Examples: "! Dec 09", "1 Desember2009", "22-Jan013".
pattern5 = r'(!|\d+)[- ]+([A-Za-z]+)[ ]?(\d+)'
# Maps month spellings (English and Indonesian names, abbreviations and
# misspellings) to two-digit month numbers.  Keys are case-sensitive.
# The spellings are grouped by month for readability; sorting the pairs
# yields the keys in code-point order.
month_dict = dict(sorted(
    (spelling, number)
    for number, spellings in (
        ('01', ('Jan', 'Januari', 'January')),
        ('02', ('Feb', 'Febrauri', 'Februari', 'Februaru', 'February',
                'Febuari', 'Pebruari')),
        ('03', ('Mar', 'March', 'Maret', 'Mart', 'Mrt', 'maret')),
        ('04', ('Apr', 'April', 'Aprl', 'Aprll')),
        ('05', ('MAy', 'May', 'Mei', 'mei')),
        ('06', ('Jun', 'June', 'Juni')),
        ('07', ('JULI', 'Jul', 'Juli', 'July', 'juli')),
        ('08', ('Agsts', 'Agsutus', 'Agts', 'Agust', 'Agustus', 'Aug',
                'August', 'agust')),
        ('09', ('Sep', 'Sepetember', 'Sept', 'September', 'Septembver')),
        ('10', ('Oct', 'October', 'Okober', 'Okt', 'Okt0ber', 'Oktober',
                'oct', 'oktober')),
        ('11', ('No', 'Nof', 'Nop', 'Nopember', 'Nov', 'November',
                'november')),
        ('12', ('Deaember', 'Dec', 'December', 'Des', 'Desember', 'des',
                'desmb')),
    )
    for spelling in spellings
))
def get_date(text: str):
    """Split a free-form date string into raw (year, month, day) pieces.

    The text is matched against pattern1..pattern5 in that order and the
    first pattern that matches wins.  The returned pieces are the captured
    substrings exactly as written in the text; no normalisation is done here.

    Args:
        text: The date string to parse.

    Returns:
        A ``(year, month, day)`` tuple of strings, or ``(None, None, None)``
        when no pattern matches.
    """
    # re.match only matches at the very start of the string, so leading
    # whitespace would otherwise defeat every pattern.
    stripped = text.strip()

    # Each entry is (pattern, month_first).  Only pattern3 captures the month
    # before the day; every other pattern captures day, month, year.
    attempts = (
        (pattern1, False),
        (pattern2, False),
        (pattern3, True),
        (pattern4, False),
        (pattern5, False),
    )
    for pattern, month_first in attempts:
        found = re.match(pattern, stripped)
        if not found:
            continue
        first, second, year = found.groups()
        if month_first:
            month, day = first, second
        else:
            day, month = first, second
        return year, month, day
    return None, None, None
def clean_date_indonesia(text):
    """Normalise a free-form date string to ``YYYY-MM-DD``.

    The text is split with get_date() and each piece is normalised with
    clean_year(), clean_month() and clean_day().

    Args:
        text: The raw date string; falsy values (``None``, ``''``) are
            accepted.

    Returns:
        The date formatted as ``'YYYY-MM-DD'``, or ``None`` when the text is
        empty, cannot be parsed, has an invalid component, or does not name a
        real calendar date (for example 31 February).
    """
    # Imported here so the function does not depend on a file-level import.
    import datetime

    if not text:
        return None

    year, month, day = get_date(text)
    year = clean_year(year)
    month = clean_month(month)
    day = clean_day(day)
    if not (year and month and day):
        return None

    # The components were validated independently; make sure the combination
    # exists in the calendar (day-of-month limits, leap years) and that every
    # piece really is numeric.
    try:
        datetime.date(int(year), int(month), int(day))
    except ValueError:
        return None
    return f'{year}-{month}-{day}'
def clean_year(year: str, max_year: int = 2024):
    """Normalise a raw year string to four digits.

    Spaces are removed first.  Shortened years are then expanded:

    * one character ``d``      -> ``200d``
    * two characters ``yy``    -> ``20yy`` when ``yy < '30'``, else ``19yy``
    * three characters ``0yy`` -> ``20yy``
    * three characters ``xyy``  -> ``x0yy`` (a dropped zero, e.g. ``209``)

    Args:
        year: The raw year text; falsy values are accepted.
        max_year: Largest year accepted as valid (inclusive).

    Returns:
        The four-character year, or ``None`` when the input is empty, is not
        numeric, or lies outside the range 1901..``max_year``.
    """
    if not year:
        return None

    digits = year.replace(' ', '')
    if len(digits) == 1:
        digits = f'200{digits}'
    elif len(digits) == 2:
        century = '20' if digits < '30' else '19'
        digits = f'{century}{digits}'
    elif len(digits) == 3:
        if digits[0] == '0':
            digits = f'2{digits}'
        else:
            digits = f'{digits[0]}0{digits[1:]}'

    # Expanded values go through the same checks as four-digit input, so
    # results such as '2027', '9099' or '19ab' are rejected rather than
    # returned unvalidated.
    if len(digits) != 4:
        return None
    try:
        value = int(digits)
    except ValueError:
        return None
    if value <= 1900 or value > max_year:
        return None
    return digits
- def clean_month(month: str):
- if month:
- if len(month) == 1:
- month = f'0{month}'
- elif re.match( r'^\d{2}$', month):
- month = month
- else:
- month = month_dict.get(month)
- try:
- month_int = int(month)
- if month_int < 1 or month_int > 12:
- return None
- except ValueError:
- return None
- return month
- else:
- return None
def clean_day(day: str):
    """Normalise a raw day token to a two-digit day of the month.

    The characters ``'!'`` and ``'I'`` are treated as the digit 1.

    Args:
        day: The raw day text; falsy values are accepted.

    Returns:
        The day as ``'01'``..``'31'``, or ``None`` when the input is empty,
        is longer than two characters, is not numeric, or is outside 1..31.
    """
    if not day:
        return None
    if day in ('!', 'I'):
        return '01'
    # Anything longer than two characters cannot be a day of the month.
    if len(day) > 2:
        return None
    try:
        value = int(day)
    except ValueError:
        return None
    if value < 1 or value > 31:
        return None
    # Format from the parsed number so the result is always two digits.
    return f'{value:02d}'
if __name__ == '__main__':
    # Manual smoke run: print what each stage produces for sample inputs.

    # Full date strings, first split into raw pieces by get_date().
    raw_dates = [
        'Monday, November 03, 2014',
        'Tuesday, September 08, 2015',
        "28 Agustus' 09",
        "19Juli 2011",
        '25September 2027',
        'I Februari 2011',
        ',30 April 2013',
        '25 Agsts. 08',
        "4'Nov 08",
        "15 Des.08",
        "'06-Sept-10",
        "! Dec 09",
        "06- Mei 09",
        "1 Desember2009",
        "18 MAy09",
        "22-Jan013",
        "21 Okober-20 10",
        "01 Oktober 2 013",
        "19-No-13",
        "8 oct 9 ",
    ]
    for raw in raw_dates:
        print(f'{raw} -> ', get_date(raw))

    # Individual components, each passed through its own cleaner.
    year_samples = [
        '00', '01', '09', '19', '20', '79', '85', '96', '99',
        '013', '204', '209', '210',
        '2 013', '20 10',
        '1028', '2116', '10209', '13',
    ]
    month_samples = [
        '01', '09', '11', '5', '7',
        'Agsts', 'January', 'Okt', 'No',
        '17',
    ]
    day_samples = [
        '01', '09', '11', '5', '7',
        '!', 'I',
        '31', '35', '13', '898',
    ]
    for samples, cleaner in (
        (year_samples, clean_year),
        (month_samples, clean_month),
        (day_samples, clean_day),
    ):
        for sample in samples:
            print(f'{sample} -> {cleaner(sample)}')

    print('----------------------------------------------------------------|')

    # The same full date strings, run through the complete pipeline.
    for raw in raw_dates:
        print(f'{raw} -> ', clean_date_indonesia(raw))
|