Skip to content

Commit 331fc01

Browse files
miss-islington, serhiy-storchaka, eliben
authored
[3.12] gh-53203: Fix strptime() for %c and %x formats on many locales (GH-124946) (GH-125370)
In some locales (like French or Hebrew) the full or abbreviated names of the default month and weekday used in __calc_date_time can be part of another name or a constant part of the %c format. The month name can also match %m with a constant suffix (like in Japanese). So the code failed to correctly distinguish the formats %a, %A, %b, %B and %m. Cycle through all months and all days of the week to find the variable part and distinguish %a from %A and %b from %B or %m. Fixed locales for the following languages: Arabic, Bislama, Breton, Bodo, Kashubian, Chuvash, Estonian, French, Irish, Ge'ez, Gujarati, Manx Gaelic, Hebrew, Hindi, Chhattisgarhi, Haitian Kreyol, Japanese, Kannada, Korean, Marathi, Malay, Norwegian, Nynorsk, Punjabi, Rajasthani, Tok Pisin, Yoruba, Yue Chinese, Yau/Nungon and Chinese. (cherry picked from commit c05f9dd) Co-authored-by: Serhiy Storchaka <[email protected]> Co-authored-by: Eli Bendersky <[email protected]>
1 parent e01a178 commit 331fc01

File tree

3 files changed

+134
-34
lines changed

3 files changed

+134
-34
lines changed

Lib/_strptime.py

Lines changed: 106 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,18 @@ def _getlang():
2727
# Figure out what the current language is set to.
2828
return locale.getlocale(locale.LC_TIME)
2929

30+
def _findall(haystack, needle):
31+
# Find all positions of needle in haystack.
32+
if not needle:
33+
return
34+
i = 0
35+
while True:
36+
i = haystack.find(needle, i)
37+
if i < 0:
38+
break
39+
yield i
40+
i += len(needle)
41+
3042
class LocaleTime(object):
3143
"""Stores and handles locale-specific information related to time.
3244
@@ -101,7 +113,8 @@ def __calc_am_pm(self):
101113
am_pm = []
102114
for hour in (1, 22):
103115
time_tuple = time.struct_time((1999,3,17,hour,44,55,2,76,0))
104-
am_pm.append(time.strftime("%p", time_tuple).lower())
116+
# br_FR has AM/PM info (' ',' ').
117+
am_pm.append(time.strftime("%p", time_tuple).lower().strip())
105118
self.am_pm = am_pm
106119

107120
def __calc_date_time(self):
@@ -113,42 +126,114 @@ def __calc_date_time(self):
113126
# values within the format string is very important; it eliminates
114127
# possible ambiguity for what something represents.
115128
time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
116-
date_time = [None, None, None]
117-
date_time[0] = time.strftime("%c", time_tuple).lower()
118-
date_time[1] = time.strftime("%x", time_tuple).lower()
119-
date_time[2] = time.strftime("%X", time_tuple).lower()
120-
replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'),
121-
(self.f_month[3], '%B'), (self.a_weekday[2], '%a'),
122-
(self.a_month[3], '%b'), (self.am_pm[1], '%p'),
129+
time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
130+
replacement_pairs = [
123131
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
124132
('44', '%M'), ('55', '%S'), ('76', '%j'),
125133
('17', '%d'), ('03', '%m'), ('3', '%m'),
126134
# '3' needed for when no leading zero.
127135
('2', '%w'), ('10', '%I')]
128-
replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone
129-
for tz in tz_values])
130-
for offset,directive in ((0,'%c'), (1,'%x'), (2,'%X')):
131-
current_format = date_time[offset]
132-
for old, new in replacement_pairs:
136+
date_time = []
137+
for directive in ('%c', '%x', '%X'):
138+
current_format = time.strftime(directive, time_tuple).lower()
139+
current_format = current_format.replace('%', '%%')
140+
# The month and the day of the week formats are treated specially
141+
# because of a possible ambiguity in some locales where the full
142+
# and abbreviated names are equal or names of different types
143+
# are equal. See doc of __find_month_format for more details.
144+
lst, fmt = self.__find_weekday_format(directive)
145+
if lst:
146+
current_format = current_format.replace(lst[2], fmt, 1)
147+
lst, fmt = self.__find_month_format(directive)
148+
if lst:
149+
current_format = current_format.replace(lst[3], fmt, 1)
150+
if self.am_pm[1]:
133151
# Must deal with possible lack of locale info
134152
# manifesting itself as the empty string (e.g., Swedish's
135153
# lack of AM/PM info) or a platform returning a tuple of empty
136154
# strings (e.g., MacOS 9 having timezone as ('','')).
137-
if old:
138-
current_format = current_format.replace(old, new)
155+
current_format = current_format.replace(self.am_pm[1], '%p')
156+
for tz_values in self.timezone:
157+
for tz in tz_values:
158+
if tz:
159+
current_format = current_format.replace(tz, "%Z")
160+
for old, new in replacement_pairs:
161+
current_format = current_format.replace(old, new)
139162
# If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
140163
# 2005-01-03 occurs before the first Monday of the year. Otherwise
141164
# %U is used.
142-
time_tuple = time.struct_time((1999,1,3,1,1,1,6,3,0))
143-
if '00' in time.strftime(directive, time_tuple):
165+
if '00' in time.strftime(directive, time_tuple2):
144166
U_W = '%W'
145167
else:
146168
U_W = '%U'
147-
date_time[offset] = current_format.replace('11', U_W)
169+
current_format = current_format.replace('11', U_W)
170+
date_time.append(current_format)
148171
self.LC_date_time = date_time[0]
149172
self.LC_date = date_time[1]
150173
self.LC_time = date_time[2]
151174

175+
def __find_month_format(self, directive):
176+
"""Find the month format appropriate for the current locale.
177+
178+
In some locales (for example French and Hebrew), the default month
179+
used in __calc_date_time has the same name in full and abbreviated
180+
form. Also, the month name can by accident match other part of the
181+
representation: the day of the week name (for example in Morisyen)
182+
or the month number (for example in Japanese). Thus, cycle months
183+
of the year and find all positions that match the month name for
184+
each month, If no common positions are found, the representation
185+
does not use the month name.
186+
"""
187+
full_indices = abbr_indices = None
188+
for m in range(1, 13):
189+
time_tuple = time.struct_time((1999, m, 17, 22, 44, 55, 2, 76, 0))
190+
datetime = time.strftime(directive, time_tuple).lower()
191+
indices = set(_findall(datetime, self.f_month[m]))
192+
if full_indices is None:
193+
full_indices = indices
194+
else:
195+
full_indices &= indices
196+
indices = set(_findall(datetime, self.a_month[m]))
197+
if abbr_indices is None:
198+
abbr_indices = indices
199+
else:
200+
abbr_indices &= indices
201+
if not full_indices and not abbr_indices:
202+
return None, None
203+
if full_indices:
204+
return self.f_month, '%B'
205+
if abbr_indices:
206+
return self.a_month, '%b'
207+
return None, None
208+
209+
def __find_weekday_format(self, directive):
210+
"""Find the day of the week format appropriate for the current locale.
211+
212+
Similar to __find_month_format().
213+
"""
214+
full_indices = abbr_indices = None
215+
for wd in range(7):
216+
time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, wd, 76, 0))
217+
datetime = time.strftime(directive, time_tuple).lower()
218+
indices = set(_findall(datetime, self.f_weekday[wd]))
219+
if full_indices is None:
220+
full_indices = indices
221+
else:
222+
full_indices &= indices
223+
if self.f_weekday[wd] != self.a_weekday[wd]:
224+
indices = set(_findall(datetime, self.a_weekday[wd]))
225+
if abbr_indices is None:
226+
abbr_indices = indices
227+
else:
228+
abbr_indices &= indices
229+
if not full_indices and not abbr_indices:
230+
return None, None
231+
if full_indices:
232+
return self.f_weekday, '%A'
233+
if abbr_indices:
234+
return self.a_weekday, '%a'
235+
return None, None
236+
152237
def __calc_timezone(self):
153238
# Set self.timezone by using time.tzname.
154239
# Do not worry about possibility of time.tzname[0] == time.tzname[1]
@@ -186,7 +271,7 @@ def __init__(self, locale_time=None):
186271
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
187272
'f': r"(?P<f>[0-9]{1,6})",
188273
'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
189-
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
274+
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
190275
'G': r"(?P<G>\d\d\d\d)",
191276
'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
192277
'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
@@ -330,8 +415,8 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
330415
_regex_cache[format] = format_regex
331416
found = format_regex.match(data_string)
332417
if not found:
333-
raise ValueError("time data %r does not match format %r" %
334-
(data_string, format))
418+
raise ValueError("time data %r does not match format %r :: /%s/" %
419+
(data_string, format, format_regex.pattern))
335420
if len(data_string) != found.end():
336421
raise ValueError("unconverted data remains: %s" %
337422
data_string[found.end():])

Lib/test/test_strptime.py

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,21 @@
55
import locale
66
import re
77
import os
8+
import platform
89
import sys
910
from test import support
1011
from test.support import skip_if_buggy_ucrt_strfptime, run_with_locales
1112
from datetime import date as datetime_date
1213

1314
import _strptime
1415

16+
libc_ver = platform.libc_ver()
17+
if libc_ver[0] == 'glibc':
18+
glibc_ver = tuple(map(int, libc_ver[1].split('.')))
19+
else:
20+
glibc_ver = None
21+
22+
1523
class getlang_Tests(unittest.TestCase):
1624
"""Test _getlang"""
1725
def test_basic(self):
@@ -476,16 +484,16 @@ def test_bad_timezone(self):
476484
# * Year is not included: ha_NG.
477485
# * Use non-Gregorian calendar: lo_LA, thai, th_TH.
478486
#
479-
# BUG: Generates invalid regexp for br_FR, csb_PL, Arabic.
480-
# BUG: Generates regexp that does not match the current date and time
481-
# for fa_IR, gez_ER, gez_ET, lzh_TW, my_MM, or_IN, shn_MM, yo_NG.
482487
# BUG: Generates regexp that does not match the current date and time
483-
# for fa_IR, gez_ER, gez_ET, lzh_TW, my_MM, or_IN, shn_MM, yo_NG,
484-
# fr_FR, ja_JP, he_IL, ko_KR, zh_CN, etc.
485-
@run_with_locales('LC_TIME', 'C', 'en_US', 'de_DE',
486-
'eu_ES', 'mfe_MU')
488+
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
489+
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
490+
'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG',
491+
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
487492
def test_date_time_locale(self):
488493
# Test %c directive
494+
loc = locale.getlocale(locale.LC_TIME)[0]
495+
if glibc_ver and glibc_ver < (2, 31) and loc == 'br_FR':
496+
self.skipTest('%c in locale br_FR does not include time')
489497
now = time.time()
490498
self.roundtrip('%c', slice(0, 6), time.localtime(now))
491499
# 1 hour 20 minutes 30 seconds ago
@@ -503,18 +511,19 @@ def test_date_time_locale(self):
503511

504512
# NB: Dates before 1969 do not roundtrip on some locales:
505513
# bo_CN, bo_IN, dz_BT, eu_ES, eu_FR.
506-
@run_with_locales('LC_TIME', 'C', 'en_US', 'de_DE', 'ja_JP')
514+
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
515+
'he_IL', 'ar_AE', 'mfe_MU', 'yo_NG',
516+
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
507517
def test_date_time_locale2(self):
508518
# Test %c directive
509519
self.roundtrip('%c', slice(0, 6), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
510520

511521
# NB: Does not roundtrip because use non-Gregorian calendar:
512522
# lo_LA, thai, th_TH.
513523
# BUG: Generates regexp that does not match the current date
514-
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM,
515-
# Arabic, ja_JP, ko_KR, zh_CN, etc.
516-
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE',
517-
'he_IL', 'eu_ES')
524+
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
525+
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
526+
'he_IL', 'eu_ES', 'ar_AE')
518527
def test_date_locale(self):
519528
# Test %x directive
520529
now = time.time()
@@ -533,7 +542,8 @@ def test_date_locale(self):
533542
support.is_emscripten or support.is_wasi,
534543
"musl libc issue on Emscripten, bpo-46390"
535544
)
536-
@run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP')
545+
@run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
546+
'eu_ES', 'ar_AE')
537547
def test_date_locale2(self):
538548
# Test %x directive
539549
self.roundtrip('%x', slice(0, 3), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix :func:`time.strptime` for ``%c`` and ``%x`` formats in many locales:
2+
Arabic, Bislama, Breton, Bodo, Kashubian, Chuvash, Estonian, French, Irish,
3+
Ge'ez, Gujarati, Manx Gaelic, Hebrew, Hindi, Chhattisgarhi, Haitian Kreyol,
4+
Japanese, Kannada, Korean, Marathi, Malay, Norwegian, Nynorsk, Punjabi,
5+
Rajasthani, Tok Pisin, Yoruba, Yue Chinese, Yau/Nungon and Chinese.

0 commit comments

Comments
 (0)