jaxws/src/java.xml.soap/share/classes/com/sun/xml/internal/messaging/saaj/packaging/mime/internet/HeaderTokenizer.java
changeset 28644 a70f5680dbab
parent 28643 a665e19ca007
parent 28642 a42fefc69922
child 28647 f44908f03772
equal deleted inserted replaced
28643:a665e19ca007 28644:a70f5680dbab
     1 /*
       
     2  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.  Oracle designates this
       
     8  * particular file as subject to the "Classpath" exception as provided
       
     9  * by Oracle in the LICENSE file that accompanied this code.
       
    10  *
       
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    14  * version 2 for more details (a copy is included in the LICENSE file that
       
    15  * accompanied this code).
       
    16  *
       
    17  * You should have received a copy of the GNU General Public License version
       
    18  * 2 along with this work; if not, write to the Free Software Foundation,
       
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    20  *
       
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    22  * or visit www.oracle.com if you need additional information or have any
       
    23  * questions.
       
    24  */
       
    25 
       
    26 /*
       
    27  * @(#)HeaderTokenizer.java   1.9 02/03/27
       
    28  */
       
    29 
       
    30 
       
    31 
       
    32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet;
       
    33 
       
    34 
       
    35 /**
       
    36  * This class tokenizes RFC822 and MIME headers into the basic
       
    37  * symbols specified by RFC822 and MIME. <p>
       
    38  *
       
    39  * This class handles folded headers (ie headers with embedded
       
    40  * CRLF SPACE sequences). The folds are removed in the returned
       
    41  * tokens.
       
    42  *
       
    43  * @version 1.9, 02/03/27
       
    44  * @author  John Mani
       
    45  */
       
    46 
       
    47 public class HeaderTokenizer {
       
    48 
       
    49     /**
       
    50      * The Token class represents tokens returned by the
       
    51      * HeaderTokenizer.
       
    52      */
       
    53     public static class Token {
       
    54 
       
    55         private int type;
       
    56         private String value;
       
    57 
       
    58         /**
       
    59          * Token type indicating an ATOM.
       
    60          */
       
    61         public static final int ATOM            = -1;
       
    62 
       
    63         /**
       
    64          * Token type indicating a quoted string. The value
       
    65          * field contains the string without the quotes.
       
    66          */
       
    67         public static final int QUOTEDSTRING    = -2;
       
    68 
       
    69         /**
       
    70          * Token type indicating a comment. The value field
       
    71          * contains the comment string without the comment
       
    72          * start and end symbols.
       
    73          */
       
    74         public static final int COMMENT         = -3;
       
    75 
       
    76         /**
       
    77          * Token type indicating end of input.
       
    78          */
       
    79         public static final int  EOF            = -4;
       
    80 
       
    81         /**
       
    82          * Constructor.
       
    83          * @param       type    Token type
       
    84          * @param       value   Token value
       
    85          */
       
    86         public Token(int type, String value) {
       
    87              this.type = type;
       
    88              this.value = value;
       
    89         }
       
    90 
       
    91         /**
       
    92          * Return the type of the token. If the token represents a
       
    93          * delimiter or a control character, the type is that character
       
    94          * itself, converted to an integer. Otherwise, it's value is
       
    95          * one of the following:
       
    96          * <ul>
       
    97          * <li><code>ATOM</code> A sequence of ASCII characters
       
    98          *      delimited by either SPACE, CTL, "(", <"> or the
       
    99          *      specified SPECIALS
       
   100          * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
       
   101          *      within quotes
       
   102          * <li><code>COMMENT</code> A sequence of ASCII characters
       
   103          *      within "(" and ")".
       
   104          * <li><code>EOF</code> End of header
       
   105          * </ul>
       
   106          */
       
   107         public int getType() {
       
   108             return type;
       
   109         }
       
   110 
       
   111         /**
       
   112          * Returns the value of the token just read. When the current
       
   113          * token is a quoted string, this field contains the body of the
       
   114          * string, without the quotes. When the current token is a comment,
       
   115          * this field contains the body of the comment.
       
   116          *
       
   117          * @return      token value
       
   118          */
       
   119         public String getValue() {
       
   120             return value;
       
   121         }
       
   122     }
       
   123 
       
   124     private String string; // the string to be tokenized
       
   125     private boolean skipComments; // should comments be skipped ?
       
   126     private String delimiters; // delimiter string
       
   127     private int currentPos; // current parse position
       
   128     private int maxPos; // string length
       
   129     private int nextPos; // track start of next Token for next()
       
   130     private int peekPos; // track start of next Token for peek()
       
   131 
       
   132     /**
       
   133      * RFC822 specials
       
   134      */
       
   135     public final static String RFC822 = "()<>@,;:\\\"\t .[]";
       
   136 
       
   137     /**
       
   138      * MIME specials
       
   139      */
       
   140     public final static String MIME = "()<>@,;:\\\"\t []/?=";
       
   141 
       
   142     // The EOF Token
       
   143     private final static Token EOFToken = new Token(Token.EOF, null);
       
   144 
       
   145     /**
       
   146      * Constructor that takes a rfc822 style header.
       
   147      *
       
   148      * @param   header  The rfc822 header to be tokenized
       
   149      * @param   delimiters      Set of delimiter characters
       
   150      *                          to be used to delimit ATOMS. These
       
   151      *                          are usually <code>RFC822</code> or
       
   152      *                          <code>MIME</code>
       
   153      * @param   skipComments  If true, comments are skipped and
       
   154      *                          not returned as tokens
       
   155      */
       
   156     public HeaderTokenizer(String header, String delimiters,
       
   157                            boolean skipComments) {
       
   158         string = (header == null) ? "" : header; // paranoia ?!
       
   159         this.skipComments = skipComments;
       
   160         this.delimiters = delimiters;
       
   161         currentPos = nextPos = peekPos = 0;
       
   162         maxPos = string.length();
       
   163     }
       
   164 
       
   165     /**
       
   166      * Constructor. Comments are ignored and not returned as tokens
       
   167      *
       
   168      * @param   header  The header that is tokenized
       
   169      * @param   delimiters  The delimiters to be used
       
   170      */
       
   171     public HeaderTokenizer(String header, String delimiters) {
       
   172         this(header, delimiters, true);
       
   173     }
       
   174 
       
   175     /**
       
   176      * Constructor. The RFC822 defined delimiters - RFC822 - are
       
   177      * used to delimit ATOMS. Also comments are skipped and not
       
   178      * returned as tokens
       
   179      */
       
   180     public HeaderTokenizer(String header)  {
       
   181         this(header, RFC822);
       
   182     }
       
   183 
       
   184     /**
       
   185      * Parses the next token from this String. <p>
       
   186      *
       
   187      * Clients sit in a loop calling next() to parse successive
       
   188      * tokens until an EOF Token is returned.
       
   189      *
       
   190      * @return          the next Token
       
   191      * @exception       ParseException if the parse fails
       
   192      */
       
   193     public Token next() throws ParseException {
       
   194         Token tk;
       
   195 
       
   196         currentPos = nextPos; // setup currentPos
       
   197         tk = getNext();
       
   198         nextPos = peekPos = currentPos; // update currentPos and peekPos
       
   199         return tk;
       
   200     }
       
   201 
       
   202     /**
       
   203      * Peek at the next token, without actually removing the token
       
   204      * from the parse stream. Invoking this method multiple times
       
   205      * will return successive tokens, until <code>next()</code> is
       
   206      * called. <p>
       
   207      *
       
   208      * @return          the next Token
       
   209      * @exception       ParseException if the parse fails
       
   210      */
       
   211     public Token peek() throws ParseException {
       
   212         Token tk;
       
   213 
       
   214         currentPos = peekPos; // setup currentPos
       
   215         tk = getNext();
       
   216         peekPos = currentPos; // update peekPos
       
   217         return tk;
       
   218     }
       
   219 
       
   220     /**
       
   221      * Return the rest of the Header.
       
   222      *
       
   223      * @return String   rest of header. null is returned if we are
       
   224      *                  already at end of header
       
   225      */
       
   226     public String getRemainder() {
       
   227         return string.substring(nextPos);
       
   228     }
       
   229 
       
   230     /*
       
   231      * Return the next token starting from 'currentPos'. After the
       
   232      * parse, 'currentPos' is updated to point to the start of the
       
   233      * next token.
       
   234      */
       
   235     private Token getNext() throws ParseException {
       
   236         // If we're already at end of string, return EOF
       
   237         if (currentPos >= maxPos)
       
   238             return EOFToken;
       
   239 
       
   240         // Skip white-space, position currentPos beyond the space
       
   241         if (skipWhiteSpace() == Token.EOF)
       
   242             return EOFToken;
       
   243 
       
   244         char c;
       
   245         int start;
       
   246         boolean filter = false;
       
   247 
       
   248         c = string.charAt(currentPos);
       
   249 
       
   250         // Check or Skip comments and position currentPos
       
   251         // beyond the comment
       
   252         while (c == '(') {
       
   253             // Parsing comment ..
       
   254             int nesting;
       
   255             for (start = ++currentPos, nesting = 1;
       
   256                  nesting > 0 && currentPos < maxPos;
       
   257                  currentPos++) {
       
   258                 c = string.charAt(currentPos);
       
   259                 if (c == '\\') {  // Escape sequence
       
   260                     currentPos++; // skip the escaped character
       
   261                     filter = true;
       
   262                 } else if (c == '\r')
       
   263                     filter = true;
       
   264                 else if (c == '(')
       
   265                     nesting++;
       
   266                 else if (c == ')')
       
   267                     nesting--;
       
   268             }
       
   269             if (nesting != 0)
       
   270                 throw new ParseException("Unbalanced comments");
       
   271 
       
   272             if (!skipComments) {
       
   273                 // Return the comment, if we are asked to.
       
   274                 // Note that the comment start & end markers are ignored.
       
   275                 String s;
       
   276                 if (filter) // need to go thru the token again.
       
   277                     s = filterToken(string, start, currentPos-1);
       
   278                 else
       
   279                     s = string.substring(start,currentPos-1);
       
   280 
       
   281                 return new Token(Token.COMMENT, s);
       
   282             }
       
   283 
       
   284             // Skip any whitespace after the comment.
       
   285             if (skipWhiteSpace() == Token.EOF)
       
   286                 return EOFToken;
       
   287             c = string.charAt(currentPos);
       
   288         }
       
   289 
       
   290         // Check for quoted-string and position currentPos
       
   291         //  beyond the terminating quote
       
   292         if (c == '"') {
       
   293             for (start = ++currentPos; currentPos < maxPos; currentPos++) {
       
   294                 c = string.charAt(currentPos);
       
   295                 if (c == '\\') { // Escape sequence
       
   296                     currentPos++;
       
   297                     filter = true;
       
   298                 } else if (c == '\r')
       
   299                     filter = true;
       
   300                 else if (c == '"') {
       
   301                     currentPos++;
       
   302                     String s;
       
   303 
       
   304                     if (filter)
       
   305                         s = filterToken(string, start, currentPos-1);
       
   306                     else
       
   307                         s = string.substring(start,currentPos-1);
       
   308 
       
   309                     return new Token(Token.QUOTEDSTRING, s);
       
   310                 }
       
   311             }
       
   312             throw new ParseException("Unbalanced quoted string");
       
   313         }
       
   314 
       
   315         // Check for SPECIAL or CTL
       
   316         if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
       
   317             currentPos++; // re-position currentPos
       
   318             char ch[] = new char[1];
       
   319             ch[0] = c;
       
   320             return new Token((int)c, new String(ch));
       
   321         }
       
   322 
       
   323         // Check for ATOM
       
   324         for (start = currentPos; currentPos < maxPos; currentPos++) {
       
   325             c = string.charAt(currentPos);
       
   326             // ATOM is delimited by either SPACE, CTL, "(", <">
       
   327             // or the specified SPECIALS
       
   328             if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
       
   329                 c == '"' || delimiters.indexOf(c) >= 0)
       
   330                 break;
       
   331         }
       
   332         return new Token(Token.ATOM, string.substring(start, currentPos));
       
   333     }
       
   334 
       
   335     // Skip SPACE, HT, CR and NL
       
   336     private int skipWhiteSpace() {
       
   337         char c;
       
   338         for (; currentPos < maxPos; currentPos++)
       
   339             if (((c = string.charAt(currentPos)) != ' ') &&
       
   340                 (c != '\t') && (c != '\r') && (c != '\n'))
       
   341                 return currentPos;
       
   342         return Token.EOF;
       
   343     }
       
   344 
       
   345     /* Process escape sequences and embedded LWSPs from a comment or
       
   346      * quoted string.
       
   347      */
       
   348     private static String filterToken(String s, int start, int end) {
       
   349         StringBuffer sb = new StringBuffer();
       
   350         char c;
       
   351         boolean gotEscape = false;
       
   352         boolean gotCR = false;
       
   353 
       
   354         for (int i = start; i < end; i++) {
       
   355             c = s.charAt(i);
       
   356             if (c == '\n' && gotCR) {
       
   357                 // This LF is part of an unescaped
       
   358                 // CRLF sequence (i.e, LWSP). Skip it.
       
   359                 gotCR = false;
       
   360                 continue;
       
   361             }
       
   362 
       
   363             gotCR = false;
       
   364             if (!gotEscape) {
       
   365                 // Previous character was NOT '\'
       
   366                 if (c == '\\') // skip this character
       
   367                     gotEscape = true;
       
   368                 else if (c == '\r') // skip this character
       
   369                     gotCR = true;
       
   370                 else // append this character
       
   371                     sb.append(c);
       
   372             } else {
       
   373                 // Previous character was '\'. So no need to
       
   374                 // bother with any special processing, just
       
   375                 // append this character
       
   376                 sb.append(c);
       
   377                 gotEscape = false;
       
   378             }
       
   379         }
       
   380         return sb.toString();
       
   381     }
       
   382 }