6644493: [Fmt-Da] SimpleDateFormat parsing RFC822 time offset is slow
authorokutsu
Fri, 17 Dec 2010 17:13:43 +0900
changeset 7779 50fa5dbf83e6
parent 7778 f66de3eb25f0
child 7780 fc2905d8201f
6644493: [Fmt-Da] SimpleDateFormat parsing RFC822 time offset is slow Reviewed-by: peytoia
jdk/src/share/classes/java/text/SimpleDateFormat.java
jdk/test/java/text/Format/DateFormat/ISO8601ZoneTest.java
--- a/jdk/src/share/classes/java/text/SimpleDateFormat.java	Fri Dec 17 16:56:13 2010 +0900
+++ b/jdk/src/share/classes/java/text/SimpleDateFormat.java	Fri Dec 17 17:13:43 2010 +0900
@@ -1663,6 +1663,81 @@
     }
 
     /**
+     * Parses numeric forms of time zone offset, such as "hh:mm", and
+     * sets calb to the parsed value.
+     *
+     * @param text  the text to be parsed
+     * @param start the character position to start parsing
+     * @param sign  1: positive; -1: negative
+     * @param count 0: 'Z' or "GMT+hh:mm" parsing; 1 - 3: the number of 'X's
+     * @param colon true - colon required between hh and mm; false - no colon required
+     * @param calb  a CalendarBuilder in which the parsed value is stored
+     * @return updated parsed position, or its negative value to indicate a parsing error
+     */
+    private int subParseNumericZone(String text, int start, int sign, int count,
+                                    boolean colon, CalendarBuilder calb) {
+        int index = start;
+
+      parse:
+        try {
+            char c = text.charAt(index++);
+            // Parse hh
+            int hours;
+            if (!isDigit(c)) {
+                break parse;
+            }
+            hours = c - '0';
+            c = text.charAt(index++);
+            if (isDigit(c)) {
+                hours = hours * 10 + (c - '0');
+            } else {
+                // If no colon in RFC 822 or 'X' (ISO), two digits are
+                // required.
+                if (count > 0 || !colon) {
+                    break parse;
+                }
+                --index;
+            }
+            if (hours > 23) {
+                break parse;
+            }
+            int minutes = 0;
+            if (count != 1) {
+                // Proceed with parsing mm
+                c = text.charAt(index++);
+                if (colon) {
+                    if (c != ':') {
+                        break parse;
+                    }
+                    c = text.charAt(index++);
+                }
+                if (!isDigit(c)) {
+                    break parse;
+                }
+                minutes = c - '0';
+                c = text.charAt(index++);
+                if (!isDigit(c)) {
+                    break parse;
+                }
+                minutes = minutes * 10 + (c - '0');
+                if (minutes > 59) {
+                    break parse;
+                }
+            }
+            minutes += hours * 60;
+            calb.set(Calendar.ZONE_OFFSET, minutes * MILLIS_PER_MINUTE * sign)
+                .set(Calendar.DST_OFFSET, 0);
+            return index;
+        } catch (IndexOutOfBoundsException e) {
+        }
+        return  1 - index; // -(index - 1)
+    }
+
+    private boolean isDigit(char c) {
+        return c >= '0' && c <= '9';
+    }
+
+    /**
      * Private member function that converts the parsed date strings into
      * timeFields. Returns -start (for ParsePosition) if failed.
      * @param text the time text to be parsed.
@@ -1907,248 +1982,95 @@
 
             case PATTERN_ZONE_NAME:  // 'z'
             case PATTERN_ZONE_VALUE: // 'Z'
-                // First try to parse generic forms such as GMT-07:00. Do this first
-                // in case localized TimeZoneNames contains the string "GMT"
-                // for a zone; in that case, we don't want to match the first three
-                // characters of GMT+/-hh:mm etc.
                 {
                     int sign = 0;
-                    int offset;
-
-                    // For time zones that have no known names, look for strings
-                    // of the form:
-                    //    GMT[+-]hours:minutes or
-                    //    GMT.
-                    if ((text.length() - start) >= GMT.length() &&
-                        text.regionMatches(true, start, GMT, 0, GMT.length())) {
-                        int num;
-                        calb.set(Calendar.DST_OFFSET, 0);
-                        pos.index = start + GMT.length();
-
-                        try { // try-catch for "GMT" only time zone string
-                            char c = text.charAt(pos.index);
-                            if (c == '+') {
-                                sign = 1;
-                            } else if (c == '-') {
-                                sign = -1;
-                            }
+                    try {
+                        char c = text.charAt(pos.index);
+                        if (c == '+') {
+                            sign = 1;
+                        } else if (c == '-') {
+                            sign = -1;
                         }
-                        catch(StringIndexOutOfBoundsException e) {}
-
-                        if (sign == 0) {    /* "GMT" without offset */
-                            calb.set(Calendar.ZONE_OFFSET, 0);
-                            return pos.index;
-                        }
+                        if (sign == 0) {
+                            // Try parsing a custom time zone "GMT+hh:mm" or "GMT".
+                            if ((c == 'G' || c == 'g')
+                                && (text.length() - start) >= GMT.length()
+                                && text.regionMatches(true, start, GMT, 0, GMT.length())) {
+                                pos.index = start + GMT.length();
 
-                        // Look for hours.
-                        try {
-                            char c = text.charAt(++pos.index);
-                            if (c < '0' || c > '9') { /* must be from '0' to '9'. */
-                                break parsing;
-                            }
-                            num = c - '0';
-
-                            if (text.charAt(++pos.index) != ':') {
-                                c = text.charAt(pos.index);
-                                if (c < '0' || c > '9') { /* must be from '0' to '9'. */
-                                    break parsing;
+                                if ((text.length() - pos.index) > 0) {
+                                    c = text.charAt(pos.index);
+                                    if (c == '+') {
+                                        sign = 1;
+                                    } else if (c == '-') {
+                                        sign = -1;
+                                    }
                                 }
-                                num *= 10;
-                                num += c - '0';
-                                pos.index++;
-                            }
-                            if (num > 23) {
-                                --pos.index;
-                                break parsing;
-                            }
-                            if  (text.charAt(pos.index) != ':') {
-                                break parsing;
-                            }
 
-                            // Look for minutes.
-                            offset = num * 60;
-                            c = text.charAt(++pos.index);
-                            if (c < '0' || c > '9') { /* must be from '0' to '9'. */
-                                break parsing;
-                            }
-                            num = c - '0';
-                            c = text.charAt(++pos.index);
-                            if (c < '0' || c > '9') { /* must be from '0' to '9'. */
-                                break parsing;
-                            }
-                            num *= 10;
-                            num += c - '0';
+                                if (sign == 0) {    /* "GMT" without offset */
+                                    calb.set(Calendar.ZONE_OFFSET, 0)
+                                        .set(Calendar.DST_OFFSET, 0);
+                                    return pos.index;
+                                }
 
-                            if (num > 59) {
-                                break parsing;
-                            }
-                        } catch (StringIndexOutOfBoundsException e) {
-                            break parsing;
-                        }
-                        offset += num;
-                        // Fall through for final processing below of 'offset' and 'sign'.
-                    } else {
-                        // If the first character is a sign, look for numeric timezones of
-                        // the form [+-]hhmm as specified by RFC 822. Otherwise, check
-                        // for named time zones by looking through the locale data from
-                        // the TimeZoneNames strings.
-                        try {
-                            char c = text.charAt(pos.index);
-                            if (c == '+') {
-                                sign = 1;
-                            } else if (c == '-') {
-                                sign = -1;
-                            } else {
-                                // Try parsing the text as a time zone name (abbr).
-                                int i = subParseZoneString(text, pos.index, calb);
-                                if (i != 0) {
+                                // Parse the rest as "hh:mm"
+                                int i = subParseNumericZone(text, ++pos.index,
+                                                            sign, 0, true, calb);
+                                if (i > 0) {
                                     return i;
                                 }
-                                break parsing;
-                            }
-
-                            // Parse the text as an RFC 822 time zone string. This code is
-                            // actually a little more permissive than RFC 822.  It will
-                            // try to do its best with numbers that aren't strictly 4
-                            // digits long.
-
-                            // Look for hh.
-                            int hours = 0;
-                            c = text.charAt(++pos.index);
-                            if (c < '0' || c > '9') { /* must be from '0' to '9'. */
-                                break parsing;
-                            }
-                            hours = c - '0';
-                            c = text.charAt(++pos.index);
-                            if (c < '0' || c > '9') { /* must be from '0' to '9'. */
-                                break parsing;
-                            }
-                            hours *= 10;
-                            hours += c - '0';
-
-                            if (hours > 23) {
-                                break parsing;
+                                pos.index = -i;
+                            } else {
+                                // Try parsing the text as a time zone
+                                // name or abbreviation.
+                                int i = subParseZoneString(text, pos.index, calb);
+                                if (i > 0) {
+                                    return i;
+                                }
+                                pos.index = -i;
                             }
-
-                            // Look for mm.
-                            int minutes = 0;
-                            c = text.charAt(++pos.index);
-                            if (c < '0' || c > '9') { /* must be from '0' to '9'. */
-                                break parsing;
-                            }
-                            minutes = c - '0';
-                            c = text.charAt(++pos.index);
-                            if (c < '0' || c > '9') { /* must be from '0' to '9'. */
-                                break parsing;
+                        } else {
+                            // Parse the rest as "hhmm" (RFC 822)
+                            int i = subParseNumericZone(text, ++pos.index,
+                                                        sign, 0, false, calb);
+                            if (i > 0) {
+                                return i;
                             }
-                            minutes *= 10;
-                            minutes += c - '0';
-
-                            if (minutes > 59) {
-                                break parsing;
-                            }
-
-                            offset = hours * 60 + minutes;
-                        } catch (StringIndexOutOfBoundsException e) {
-                            break parsing;
+                            pos.index = -i;
                         }
-                    }
-
-                    // Do the final processing for both of the above cases.  We only
-                    // arrive here if the form GMT+/-... or an RFC 822 form was seen.
-                    if (sign != 0) {
-                        offset *= MILLIS_PER_MINUTE * sign;
-                        calb.set(Calendar.ZONE_OFFSET, offset).set(Calendar.DST_OFFSET, 0);
-                        return ++pos.index;
+                    } catch (IndexOutOfBoundsException e) {
                     }
                 }
                 break parsing;
 
             case PATTERN_ISO_ZONE:   // 'X'
                 {
-                    int sign = 0;
-                    int offset = 0;
-
-                    iso8601: {
-                        try {
-                            char c = text.charAt(pos.index);
-                            if (c == 'Z') {
-                                calb.set(Calendar.ZONE_OFFSET, 0).set(Calendar.DST_OFFSET, 0);
-                                return ++pos.index;
-                            }
-
-                            // parse text as "+/-hh[[:]mm]" based on count
-                            if (c == '+') {
-                                sign = 1;
-                            } else if (c == '-') {
-                                sign = -1;
-                            }
-                            // Look for hh.
-                            int hours = 0;
-                            c = text.charAt(++pos.index);
-                            if (c < '0' || c > '9') { /* must be from '0' to '9'. */
-                                break parsing;
-                            }
-                            hours = c - '0';
-                            c = text.charAt(++pos.index);
-                            if (c < '0' || c > '9') { /* must be from '0' to '9'. */
-                                break parsing;
-                            }
-                            hours *= 10;
-                            hours += c - '0';
-                            if (hours > 23) {
-                                break parsing;
-                            }
+                    if ((text.length() - pos.index) <= 0) {
+                        break parsing;
+                    }
 
-                            if (count == 1) { // "X"
-                                offset = hours * 60;
-                                break iso8601;
-                            }
-
-                            c = text.charAt(++pos.index);
-                            // Skip ':' if "XXX"
-                            if (c == ':') {
-                                if (count == 2) {
-                                    break parsing;
-                                }
-                                c = text.charAt(++pos.index);
-                            } else {
-                                if (count == 3) {
-                                    // missing ':'
-                                    break parsing;
-                                }
-                            }
-
-                            // Look for mm.
-                            int minutes = 0;
-                            if (c < '0' || c > '9') { /* must be from '0' to '9'. */
-                                break parsing;
-                            }
-                            minutes = c - '0';
-                            c = text.charAt(++pos.index);
-                            if (c < '0' || c > '9') { /* must be from '0' to '9'. */
-                                break parsing;
-                            }
-                            minutes *= 10;
-                            minutes += c - '0';
-
-                            if (minutes > 59) {
-                                break parsing;
-                            }
-
-                            offset = hours * 60 + minutes;
-                        } catch (StringIndexOutOfBoundsException e) {
-                            break parsing;
-                        }
+                    int sign = 0;
+                    char c = text.charAt(pos.index);
+                    if (c == 'Z') {
+                        calb.set(Calendar.ZONE_OFFSET, 0).set(Calendar.DST_OFFSET, 0);
+                        return ++pos.index;
                     }
 
-                    // Do the final processing for both of the above cases.  We only
-                    // arrive here if the form GMT+/-... or an RFC 822 form was seen.
-                    if (sign != 0) {
-                        offset *= MILLIS_PER_MINUTE * sign;
-                        calb.set(Calendar.ZONE_OFFSET, offset).set(Calendar.DST_OFFSET, 0);
-                        return ++pos.index;
+                    // parse text as "+/-hh[[:]mm]" based on count
+                    if (c == '+') {
+                        sign = 1;
+                    } else if (c == '-') {
+                        sign = -1;
+                    } else {
+                        ++pos.index;
+                        break parsing;
                     }
+                    int i = subParseNumericZone(text, ++pos.index, sign, count,
+                                                count == 3, calb);
+                    if (i > 0) {
+                        return i;
+                    }
+                    pos.index = -i;
                 }
                 break parsing;
 
--- a/jdk/test/java/text/Format/DateFormat/ISO8601ZoneTest.java	Fri Dec 17 16:56:13 2010 +0900
+++ b/jdk/test/java/text/Format/DateFormat/ISO8601ZoneTest.java	Fri Dec 17 17:13:43 2010 +0900
@@ -60,48 +60,51 @@
         "yyyy-MM-dd'T'HH:mm:ss.SSSXXX",
     };
 
+    // badData[][0] - format
+    // badData[][1] - (bad) text to be parsed
+    // badData[][2] - subtext at the end of which a parse error is detected
     static final String[][] badData = {
-        { "X", "1" },
-        { "X", "+1" },
-        { "X", "-2" },
-        { "X", "-24" },
-        { "X", "+24" },
+        { "X", "1", "1" },
+        { "X", "+1", "+1" },
+        { "X", "-2", "-2" },
+        { "X", "-24", "-2" },
+        { "X", "+24", "+2" },
 
-        { "XX", "9" },
-        { "XX", "23" },
-        { "XX", "234" },
-        { "XX", "3456" },
-        { "XX", "23456" },
-        { "XX", "+1" },
-        { "XX", "-12" },
-        { "XX", "+123" },
-        { "XX", "-12:34" },
-        { "XX", "+12:34" },
-        { "XX", "-2423" },
-        { "XX", "+2423" },
-        { "XX", "-1260" },
-        { "XX", "+1260" },
+        { "XX", "9", "9" },
+        { "XX", "23", "2" },
+        { "XX", "234", "2" },
+        { "XX", "3456", "3" },
+        { "XX", "23456", "2" },
+        { "XX", "+1", "+1" },
+        { "XX", "-12", "-12" },
+        { "XX", "+123", "+123" },
+        { "XX", "-12:34", "-12" },
+        { "XX", "+12:34", "+12" },
+        { "XX", "-2423", "-2" },
+        { "XX", "+2423", "+2" },
+        { "XX", "-1260", "-126" },
+        { "XX", "+1260", "+126" },
 
-        { "XXX", "9" },
-        { "XXX", "23" },
-        { "XXX", "234" },
-        { "XXX", "3456" },
-        { "XXX", "23456" },
-        { "XXX", "2:34" },
-        { "XXX", "12:4" },
-        { "XXX", "12:34" },
-        { "XXX", "-1" },
-        { "XXX", "+1" },
-        { "XXX", "-12" },
-        { "XXX", "+12" },
-        { "XXX", "-123" },
-        { "XXX", "+123" },
-        { "XXX", "-1234" },
-        { "XXX", "+1234" },
-        { "XXX", "+24:23" },
-        { "XXX", "+12:60" },
-        { "XXX", "+1:23" },
-        { "XXX", "+12:3" },
+        { "XXX", "9", "9" },
+        { "XXX", "23", "2" },
+        { "XXX", "234", "2" },
+        { "XXX", "3456", "3" },
+        { "XXX", "23456", "2" },
+        { "XXX", "2:34", "2" },
+        { "XXX", "12:4", "1" },
+        { "XXX", "12:34", "1" },
+        { "XXX", "-1", "-1" },
+        { "XXX", "+1", "+1" },
+        { "XXX", "-12", "-12" },
+        { "XXX", "+12", "+12" },
+        { "XXX", "-123", "-12" },
+        { "XXX", "+123", "+12" },
+        { "XXX", "-1234", "-12" },
+        { "XXX", "+1234", "+12" },
+        { "XXX", "+24:23", "+2" },
+        { "XXX", "+12:60", "+12:6" },
+        { "XXX", "+1:23", "+1" },
+        { "XXX", "+12:3", "+12:3" },
     };
 
     static String[] badFormats = {
@@ -110,6 +113,8 @@
 
     public static void main(String[] args) throws Exception {
         TimeZone tz = TimeZone.getDefault();
+        Locale loc = Locale.getDefault();
+        Locale.setDefault(Locale.US);
 
         try {
             for (int i = 0; i < formatData.length; i++) {
@@ -128,7 +133,7 @@
             }
 
             for (String[] d : badData) {
-                badDataParsing(d[0], d[1]);
+                badDataParsing(d[0], d[1], d[2].length());
             }
 
             for (String fmt : badFormats) {
@@ -136,6 +141,7 @@
             }
         } finally {
             TimeZone.setDefault(tz);
+            Locale.setDefault(loc);
         }
 
     }
@@ -188,15 +194,24 @@
     }
 
 
-    static void badDataParsing(String fmt, String text) {
+    static void badDataParsing(String fmt, String text, int expectedErrorIndex) {
+        SimpleDateFormat sdf = new SimpleDateFormat(fmt);
         try {
-            SimpleDateFormat sdf = new SimpleDateFormat(fmt);
             sdf.parse(text);
             throw new RuntimeException("didn't throw an exception: fmt=" + fmt
                                        + ", text=" + text);
         } catch (ParseException e) {
             // OK
         }
+
+        ParsePosition pos = new ParsePosition(0);
+        Date d = sdf.parse(text, pos);
+        int errorIndex = pos.getErrorIndex();
+        if (d != null || errorIndex != expectedErrorIndex) {
+            throw new RuntimeException("Bad error index=" + errorIndex
+                                       + ", expected=" + expectedErrorIndex
+                                       + ", fmt=" + fmt + ", text=" + text);
+        }
     }
 
     static void badFormat(String fmt) {