Skip to content

Commit 9c15c4d

Browse files
authored
Fix strptime to handle %% and time zones correctly (#20470)
Fixes (and additional tests) for strptime, addressing failures found in the Apache Arrow test suite. Fixes #20466 and #20467.
1 parent 404a8e9 commit 9c15c4d

File tree

3 files changed

+124
-50
lines changed

3 files changed

+124
-50
lines changed

src/library.js

Lines changed: 67 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,7 @@ addToLibrary({
528528

529529
var yday = ydayFromDate(date)|0;
530530
{{{ makeSetValue('tmPtr', C_STRUCTS.tm.tm_yday, 'yday', 'i32') }}};
531-
{{{ makeSetValue('tmPtr', C_STRUCTS.tm.tm_gmtoff, '-(date.getTimezoneOffset() * 60)', 'i32') }}};
531+
{{{ makeSetValue('tmPtr', C_STRUCTS.tm.tm_gmtoff, '-(date.getTimezoneOffset() * 60)', '*') }}};
532532

533533
// Attention: DST is in December in South, and some regions don't have DST at all.
534534
var start = new Date(date.getFullYear(), 0, 1);
@@ -711,7 +711,7 @@ addToLibrary({
711711
tm_wday: {{{ makeGetValue('tm', C_STRUCTS.tm.tm_wday, 'i32') }}},
712712
tm_yday: {{{ makeGetValue('tm', C_STRUCTS.tm.tm_yday, 'i32') }}},
713713
tm_isdst: {{{ makeGetValue('tm', C_STRUCTS.tm.tm_isdst, 'i32') }}},
714-
tm_gmtoff: {{{ makeGetValue('tm', C_STRUCTS.tm.tm_gmtoff, 'i32') }}},
714+
tm_gmtoff: {{{ makeGetValue('tm', C_STRUCTS.tm.tm_gmtoff, '*') }}},
715715
tm_zone: tm_zone ? UTF8ToString(tm_zone) : ''
716716
};
717717

@@ -965,64 +965,64 @@ addToLibrary({
965965

966966
// reduce number of matchers
967967
var EQUIVALENT_MATCHERS = {
968-
'%A': '%a',
969-
'%B': '%b',
970-
'%c': '%a %b %d %H:%M:%S %Y',
971-
'%D': '%m\\/%d\\/%y',
972-
'%e': '%d',
973-
'%F': '%Y-%m-%d',
974-
'%h': '%b',
975-
'%R': '%H\\:%M',
976-
'%r': '%I\\:%M\\:%S\\s%p',
977-
'%T': '%H\\:%M\\:%S',
978-
'%x': '%m\\/%d\\/(?:%y|%Y)',
979-
'%X': '%H\\:%M\\:%S'
968+
'A': '%a',
969+
'B': '%b',
970+
'c': '%a %b %d %H:%M:%S %Y',
971+
'D': '%m\\/%d\\/%y',
972+
'e': '%d',
973+
'F': '%Y-%m-%d',
974+
'h': '%b',
975+
'R': '%H\\:%M',
976+
'r': '%I\\:%M\\:%S\\s%p',
977+
'T': '%H\\:%M\\:%S',
978+
'x': '%m\\/%d\\/(?:%y|%Y)',
979+
'X': '%H\\:%M\\:%S'
980980
};
981-
for (var matcher in EQUIVALENT_MATCHERS) {
982-
pattern = pattern.replace(matcher, EQUIVALENT_MATCHERS[matcher]);
983-
}
984-
985981
// TODO: take care of locale
986982

987983
var DATE_PATTERNS = {
988-
/* weeday name */ '%a': '(?:Sun(?:day)?)|(?:Mon(?:day)?)|(?:Tue(?:sday)?)|(?:Wed(?:nesday)?)|(?:Thu(?:rsday)?)|(?:Fri(?:day)?)|(?:Sat(?:urday)?)',
989-
/* month name */ '%b': '(?:Jan(?:uary)?)|(?:Feb(?:ruary)?)|(?:Mar(?:ch)?)|(?:Apr(?:il)?)|May|(?:Jun(?:e)?)|(?:Jul(?:y)?)|(?:Aug(?:ust)?)|(?:Sep(?:tember)?)|(?:Oct(?:ober)?)|(?:Nov(?:ember)?)|(?:Dec(?:ember)?)',
990-
/* century */ '%C': '\\d\\d',
991-
/* day of month */ '%d': '0[1-9]|[1-9](?!\\d)|1\\d|2\\d|30|31',
992-
/* hour (24hr) */ '%H': '\\d(?!\\d)|[0,1]\\d|20|21|22|23',
993-
/* hour (12hr) */ '%I': '\\d(?!\\d)|0\\d|10|11|12',
994-
/* day of year */ '%j': '00[1-9]|0?[1-9](?!\\d)|0?[1-9]\\d(?!\\d)|[1,2]\\d\\d|3[0-6]\\d',
995-
/* month */ '%m': '0[1-9]|[1-9](?!\\d)|10|11|12',
996-
/* minutes */ '%M': '0\\d|\\d(?!\\d)|[1-5]\\d',
997-
/* whitespace */ '%n': '\\s',
998-
/* AM/PM */ '%p': 'AM|am|PM|pm|A\\.M\\.|a\\.m\\.|P\\.M\\.|p\\.m\\.',
999-
/* seconds */ '%S': '0\\d|\\d(?!\\d)|[1-5]\\d|60',
1000-
/* week number */ '%U': '0\\d|\\d(?!\\d)|[1-4]\\d|50|51|52|53',
1001-
/* week number */ '%W': '0\\d|\\d(?!\\d)|[1-4]\\d|50|51|52|53',
1002-
/* weekday number */ '%w': '[0-6]',
1003-
/* 2-digit year */ '%y': '\\d\\d',
1004-
/* 4-digit year */ '%Y': '\\d\\d\\d\\d',
1005-
/* % */ '%%': '%',
1006-
/* whitespace */ '%t': '\\s',
984+
/* weekday name */ 'a': '(?:Sun(?:day)?)|(?:Mon(?:day)?)|(?:Tue(?:sday)?)|(?:Wed(?:nesday)?)|(?:Thu(?:rsday)?)|(?:Fri(?:day)?)|(?:Sat(?:urday)?)',
985+
/* month name */ 'b': '(?:Jan(?:uary)?)|(?:Feb(?:ruary)?)|(?:Mar(?:ch)?)|(?:Apr(?:il)?)|May|(?:Jun(?:e)?)|(?:Jul(?:y)?)|(?:Aug(?:ust)?)|(?:Sep(?:tember)?)|(?:Oct(?:ober)?)|(?:Nov(?:ember)?)|(?:Dec(?:ember)?)',
986+
/* century */ 'C': '\\d\\d',
987+
/* day of month */ 'd': '0[1-9]|[1-9](?!\\d)|1\\d|2\\d|30|31',
988+
/* hour (24hr) */ 'H': '\\d(?!\\d)|[0,1]\\d|20|21|22|23',
989+
/* hour (12hr) */ 'I': '\\d(?!\\d)|0\\d|10|11|12',
990+
/* day of year */ 'j': '00[1-9]|0?[1-9](?!\\d)|0?[1-9]\\d(?!\\d)|[1,2]\\d\\d|3[0-6]\\d',
991+
/* month */ 'm': '0[1-9]|[1-9](?!\\d)|10|11|12',
992+
/* minutes */ 'M': '0\\d|\\d(?!\\d)|[1-5]\\d',
993+
/* whitespace */ 'n': ' ',
994+
/* AM/PM */ 'p': 'AM|am|PM|pm|A\\.M\\.|a\\.m\\.|P\\.M\\.|p\\.m\\.',
995+
/* seconds */ 'S': '0\\d|\\d(?!\\d)|[1-5]\\d|60',
996+
/* week number */ 'U': '0\\d|\\d(?!\\d)|[1-4]\\d|50|51|52|53',
997+
/* week number */ 'W': '0\\d|\\d(?!\\d)|[1-4]\\d|50|51|52|53',
998+
/* weekday number */ 'w': '[0-6]',
999+
/* 2-digit year */ 'y': '\\d\\d',
1000+
/* 4-digit year */ 'Y': '\\d\\d\\d\\d',
1001+
/* whitespace */ 't': ' ',
1002+
/* time zone */ 'z': 'Z|(?:[\\+\\-]\\d\\d:?(?:\\d\\d)?)'
10071003
};
10081004

10091005
var MONTH_NUMBERS = {JAN: 0, FEB: 1, MAR: 2, APR: 3, MAY: 4, JUN: 5, JUL: 6, AUG: 7, SEP: 8, OCT: 9, NOV: 10, DEC: 11};
10101006
var DAY_NUMBERS_SUN_FIRST = {SUN: 0, MON: 1, TUE: 2, WED: 3, THU: 4, FRI: 5, SAT: 6};
10111007
var DAY_NUMBERS_MON_FIRST = {MON: 0, TUE: 1, WED: 2, THU: 3, FRI: 4, SAT: 5, SUN: 6};
10121008

1013-
for (var datePattern in DATE_PATTERNS) {
1014-
pattern = pattern.replace(datePattern, '('+datePattern+DATE_PATTERNS[datePattern]+')');
1015-
}
1016-
1017-
// take care of capturing groups
10181009
var capture = [];
1019-
for (var i=pattern.indexOf('%'); i>=0; i=pattern.indexOf('%')) {
1020-
capture.push(pattern[i+1]);
1021-
pattern = pattern.replace(new RegExp('\\%'+pattern[i+1], 'g'), '');
1022-
}
1010+
var pattern_out = pattern
1011+
.replace(/%(.)/g, (m, c) => EQUIVALENT_MATCHERS[c] || m)
1012+
.replace(/%(.)/g, (_, c) => {
1013+
let pat = DATE_PATTERNS[c];
1014+
if (pat){
1015+
capture.push(c);
1016+
return `(${pat})`;
1017+
} else {
1018+
return c;
1019+
}
1020+
})
1021+
.replace( // any number of space or tab characters match zero or more spaces
1022+
/\s+/g,'\\s*'
1023+
);
10231024

1024-
var matches = new RegExp('^'+pattern, "i").exec(UTF8ToString(buf))
1025-
// out(UTF8ToString(buf)+ ' is matched by '+((new RegExp('^'+pattern)).source)+' into: '+JSON.stringify(matches));
1025+
var matches = new RegExp('^'+pattern_out, "i").exec(UTF8ToString(buf))
10261026

10271027
function initDate() {
10281028
function fixup(value, min, max) {
@@ -1034,7 +1034,8 @@ addToLibrary({
10341034
day: fixup({{{ makeGetValue('tm', C_STRUCTS.tm.tm_mday, 'i32') }}}, 1, 31),
10351035
hour: fixup({{{ makeGetValue('tm', C_STRUCTS.tm.tm_hour, 'i32') }}}, 0, 23),
10361036
min: fixup({{{ makeGetValue('tm', C_STRUCTS.tm.tm_min, 'i32') }}}, 0, 59),
1037-
sec: fixup({{{ makeGetValue('tm', C_STRUCTS.tm.tm_sec, 'i32') }}}, 0, 59)
1037+
sec: fixup({{{ makeGetValue('tm', C_STRUCTS.tm.tm_sec, 'i32') }}}, 0, 59),
1038+
gmtoff: 0
10381039
};
10391040
};
10401041

@@ -1161,6 +1162,20 @@ addToLibrary({
11611162
}
11621163
}
11631164

1165+
// time zone
1166+
if ((value = getMatch('z'))) {
1167+
// GMT offset as either 'Z' or +-HH:MM or +-HH or +-HHMM
1168+
if (value.toLowerCase() === 'z'){
1169+
date.gmtoff = 0;
1170+
} else {
1171+
var match = value.match(/^((?:\-|\+)\d\d):?(\d\d)?/);
1172+
date.gmtoff = match[1] * 3600;
1173+
if (match[2]) {
1174+
date.gmtoff += date.gmtoff >0 ? match[2] * 60 : -match[2] * 60
1175+
}
1176+
}
1177+
}
1178+
11641179
/*
11651180
tm_sec int seconds after the minute 0-61*
11661181
tm_min int minutes after the hour 0-59
@@ -1171,6 +1186,7 @@ addToLibrary({
11711186
tm_wday int days since Sunday 0-6
11721187
tm_yday int days since January 1 0-365
11731188
tm_isdst int Daylight Saving Time flag
1189+
tm_gmtoff long offset from GMT (seconds)
11741190
*/
11751191

11761192
var fullDate = new Date(date.year, date.month, date.day, date.hour, date.min, date.sec, 0);
@@ -1183,7 +1199,8 @@ addToLibrary({
11831199
{{{ makeSetValue('tm', C_STRUCTS.tm.tm_wday, 'fullDate.getDay()', 'i32') }}};
11841200
{{{ makeSetValue('tm', C_STRUCTS.tm.tm_yday, 'arraySum(isLeapYear(fullDate.getFullYear()) ? MONTH_DAYS_LEAP : MONTH_DAYS_REGULAR, fullDate.getMonth()-1)+fullDate.getDate()-1', 'i32') }}};
11851201
{{{ makeSetValue('tm', C_STRUCTS.tm.tm_isdst, '0', 'i32') }}};
1186-
1202+
{{{ makeSetValue('tm', C_STRUCTS.tm.tm_gmtoff, 'date.gmtoff', '*') }}};
1203+
11871204
// we need to convert the matched sequence into an integer array to take care of UTF-8 characters > 0x7F
11881205
// TODO: not sure that intArrayFromString handles all unicode characters correctly
11891206
return buf+intArrayFromString(matches[0]).length-1;

test/core/test_strptime_tm.c

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,5 +61,52 @@ int main() {
6161
ReadMonth("november");
6262
ReadMonth("december");
6363

64+
65+
// check that %% is handled correctly for normal strings
66+
strptime("2020-05-01T00:01%z","%Y-%m-%dT%H:%M%%z",&tm);
67+
printf("%d\n",tm.tm_min);
68+
69+
// check that %% is handled correctly even if the letter after it is
70+
// in EQUIVALENT_MATCHERS
71+
strptime("%D2020-05-01T00:01","%%D%Y-%m-%dT%H:%M",&tm);
72+
printf("%d,%d\n",tm.tm_year+1900,tm.tm_min);
73+
74+
75+
// check that EQUIVALENT_MATCHERS works
76+
// %c == %a %b %d %H:%M:%S %Y
77+
strptime("Sun March 31 12:34:56 2345","%c",&tm);
78+
printf("%d,%d,%d,%d,%d,%d\n",tm.tm_year+1900,tm.tm_mon+1,tm.tm_mday,tm.tm_hour,tm.tm_min,tm.tm_sec);
79+
80+
// check that EQUIVALENT_MATCHERS works twice
81+
// 'T': '%H\\:%M\\:%S',
82+
// 'D': '%m\\/%d\\/%y',
83+
strptime("12:34:56 01/02/03","%T %D",&tm);
84+
printf("%d,%d,%d,%d,%d,%d\n",tm.tm_year+1900,tm.tm_mon+1,tm.tm_mday,tm.tm_hour,tm.tm_min,tm.tm_sec);
85+
86+
// check regex special characters don't break things
87+
strptime(".?12:34:56 01/02/03",".?%T %D",&tm);
88+
printf("%d,%d,%d,%d,%d,%d\n",tm.tm_year+1900,tm.tm_mon+1,tm.tm_mday,tm.tm_hour,tm.tm_min,tm.tm_sec);
89+
90+
91+
// check timezone offsets
92+
strptime("2020-05-01T00:00+0100","%Y-%m-%dT%H:%M%z",&tm);
93+
printf("%ld\n",tm.tm_gmtoff); // 3600
94+
95+
strptime("2020-05-01T00:00Z","%Y-%m-%dT%H:%M%z",&tm);
96+
printf("%ld\n",tm.tm_gmtoff); // 0
97+
98+
strptime("2020-05-01T00:00-02:30","%Y-%m-%dT%H:%M%z",&tm);
99+
printf("%ld\n",tm.tm_gmtoff); // -9000
100+
101+
// check that the numbers of spaces in format string are ignored
102+
strptime("12 34 56","%H %M %S",&tm);
103+
printf("%d,%d,%d\n",tm.tm_hour,tm.tm_min,tm.tm_sec);
104+
105+
strptime("123456","%H %M %S",&tm);
106+
printf("%d,%d,%d\n",tm.tm_hour,tm.tm_min,tm.tm_sec);
107+
108+
strptime("12 34 56","%H %M %S",&tm);
109+
printf("%d,%d,%d\n",tm.tm_hour,tm.tm_min,tm.tm_sec);
110+
64111
return 0;
65112
}

test/core/test_strptime_tm.out

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,13 @@ oct: 9
1616
nov: 10
1717
november: 10
1818
december: 11
19+
1
20+
2020,1
21+
2345,3,31,12,34,56
22+
2003,1,2,12,34,56
23+
2003,1,2,12,34,56
24+
3600
25+
0
26+
-9000
27+
12,34,56
28+
12,34,56

0 commit comments

Comments
 (0)