jdk/src/java.base/share/classes/java/util/StringTokenizer.java
changeset 25859 3317bb8137f4
parent 24865 09b1d992ca72
child 32108 aa5490a167ee
equal deleted inserted replaced
25858:836adbf7a2cd 25859:3317bb8137f4
       
     1 /*
       
     2  * Copyright (c) 1994, 2004, Oracle and/or its affiliates. All rights reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.  Oracle designates this
       
     8  * particular file as subject to the "Classpath" exception as provided
       
     9  * by Oracle in the LICENSE file that accompanied this code.
       
    10  *
       
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    14  * version 2 for more details (a copy is included in the LICENSE file that
       
    15  * accompanied this code).
       
    16  *
       
    17  * You should have received a copy of the GNU General Public License version
       
    18  * 2 along with this work; if not, write to the Free Software Foundation,
       
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    20  *
       
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    22  * or visit www.oracle.com if you need additional information or have any
       
    23  * questions.
       
    24  */
       
    25 
       
    26 package java.util;
       
    27 
       
    28 import java.lang.*;
       
    29 
       
    30 /**
       
    31  * The string tokenizer class allows an application to break a
       
    32  * string into tokens. The tokenization method is much simpler than
       
    33  * the one used by the <code>StreamTokenizer</code> class. The
       
    34  * <code>StringTokenizer</code> methods do not distinguish among
       
    35  * identifiers, numbers, and quoted strings, nor do they recognize
       
    36  * and skip comments.
       
    37  * <p>
       
    38  * The set of delimiters (the characters that separate tokens) may
       
    39  * be specified either at creation time or on a per-token basis.
       
    40  * <p>
       
    41  * An instance of <code>StringTokenizer</code> behaves in one of two
       
    42  * ways, depending on whether it was created with the
       
    43  * <code>returnDelims</code> flag having the value <code>true</code>
       
    44  * or <code>false</code>:
       
    45  * <ul>
       
    46  * <li>If the flag is <code>false</code>, delimiter characters serve to
       
    47  *     separate tokens. A token is a maximal sequence of consecutive
       
    48  *     characters that are not delimiters.
       
    49  * <li>If the flag is <code>true</code>, delimiter characters are themselves
       
    50  *     considered to be tokens. A token is thus either one delimiter
       
    51  *     character, or a maximal sequence of consecutive characters that are
       
    52  *     not delimiters.
       
    53  * </ul><p>
       
    54  * A <tt>StringTokenizer</tt> object internally maintains a current
       
    55  * position within the string to be tokenized. Some operations advance this
       
    56  * current position past the characters processed.<p>
       
    57  * A token is returned by taking a substring of the string that was used to
       
    58  * create the <tt>StringTokenizer</tt> object.
       
    59  * <p>
       
    60  * The following is one example of the use of the tokenizer. The code:
       
    61  * <blockquote><pre>
       
    62  *     StringTokenizer st = new StringTokenizer("this is a test");
       
    63  *     while (st.hasMoreTokens()) {
       
    64  *         System.out.println(st.nextToken());
       
    65  *     }
       
    66  * </pre></blockquote>
       
    67  * <p>
       
    68  * prints the following output:
       
    69  * <blockquote><pre>
       
    70  *     this
       
    71  *     is
       
    72  *     a
       
    73  *     test
       
    74  * </pre></blockquote>
       
    75  *
       
    76  * <p>
       
    77  * <tt>StringTokenizer</tt> is a legacy class that is retained for
       
    78  * compatibility reasons although its use is discouraged in new code. It is
       
    79  * recommended that anyone seeking this functionality use the <tt>split</tt>
       
    80  * method of <tt>String</tt> or the java.util.regex package instead.
       
    81  * <p>
       
    82  * The following example illustrates how the <tt>String.split</tt>
       
    83  * method can be used to break up a string into its basic tokens:
       
    84  * <blockquote><pre>
       
    85  *     String[] result = "this is a test".split("\\s");
       
    86  *     for (int x=0; x&lt;result.length; x++)
       
    87  *         System.out.println(result[x]);
       
    88  * </pre></blockquote>
       
    89  * <p>
       
    90  * prints the following output:
       
    91  * <blockquote><pre>
       
    92  *     this
       
    93  *     is
       
    94  *     a
       
    95  *     test
       
    96  * </pre></blockquote>
       
    97  *
       
    98  * @author  unascribed
       
    99  * @see     java.io.StreamTokenizer
       
   100  * @since   1.0
       
   101  */
       
   public class StringTokenizer implements Enumeration<Object> {
       /** Index in {@code str} from which the next token search starts. */
       private int currentPosition;

       /**
        * Start of the next token as pre-computed by {@link #hasMoreTokens()},
        * or -1 when no pre-computed value is available.
        */
       private int newPosition;

       /** Length of {@code str}; scanning never goes past this index. */
       private final int maxPosition;

       /** The string being tokenized. */
       private final String str;

       /** The current delimiter set; may be {@code null} if the constructor was given {@code null}. */
       private String delimiters;

       /** Whether delimiters are themselves returned as tokens. */
       private final boolean retDelims;

       /**
        * Set by {@link #nextToken(String)} so that the following
        * {@link #nextToken()} ignores a position pre-computed with the old
        * delimiter set.
        */
       private boolean delimsChanged;

       /**
        * maxDelimCodePoint stores the value of the delimiter character with the
        * highest value. It is used to optimize the detection of delimiter
        * characters.
        *
        * It is unlikely to provide any optimization benefit in the
        * hasSurrogates case because most string characters will be
        * smaller than the limit, but we keep it so that the two code
        * paths remain similar.
        */
       private int maxDelimCodePoint;

       /**
        * If delimiters include any surrogates (including surrogate
        * pairs), hasSurrogates is true and the tokenizer uses the
        * different code path. This is because String.indexOf(int)
        * doesn't handle unpaired surrogates as a single character.
        */
       private boolean hasSurrogates;

       /**
        * When hasSurrogates is true, delimiters are converted to code
        * points and isDelimiter(int) is used to determine if the given
        * codepoint is a delimiter. It is {@code null} otherwise.
        */
       private int[] delimiterCodePoints;

       /**
        * Recomputes all state derived from {@code delimiters}:
        * {@code maxDelimCodePoint}, {@code hasSurrogates} and
        * {@code delimiterCodePoints}.
        */
       private void setMaxDelimCodePoint() {
           // Start from a clean slate so that nothing derived from a previous
           // delimiter set (installed via nextToken(String)) leaks into this one.
           hasSurrogates = false;
           delimiterCodePoints = null;

           if (delimiters == null) {
               maxDelimCodePoint = 0;
               return;
           }

           int m = 0;
           int c;
           int count = 0;
           for (int i = 0; i < delimiters.length(); i += Character.charCount(c)) {
               c = delimiters.charAt(i);
               if (c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE) {
                   // Surrogate (paired or not): switch to whole code points.
                   c = delimiters.codePointAt(i);
                   hasSurrogates = true;
               }
               if (m < c) {
                   m = c;
               }
               count++;
           }
           maxDelimCodePoint = m;

           if (hasSurrogates) {
               // count is the number of code points visited by the loop above.
               delimiterCodePoints = new int[count];
               for (int i = 0, j = 0; i < count; i++, j += Character.charCount(c)) {
                   c = delimiters.codePointAt(j);
                   delimiterCodePoints[i] = c;
               }
           }
       }

       /**
        * Constructs a string tokenizer for the specified string. All
        * characters in the {@code delim} argument are the delimiters
        * for separating tokens.
        * <p>
        * If the {@code returnDelims} flag is {@code true}, then
        * the delimiter characters are also returned as tokens. Each
        * delimiter is returned as a string of length one. If the flag is
        * {@code false}, the delimiter characters are skipped and only
        * serve as separators between tokens.
        * <p>
        * Note that if {@code delim} is {@code null}, this constructor does
        * not throw an exception. However, trying to invoke other methods on the
        * resulting {@code StringTokenizer} may result in a
        * {@code NullPointerException}.
        *
        * @param   str            a string to be parsed.
        * @param   delim          the delimiters.
        * @param   returnDelims   flag indicating whether to return the delimiters
        *                         as tokens.
        * @throws  NullPointerException if str is {@code null}
        */
       public StringTokenizer(String str, String delim, boolean returnDelims) {
           currentPosition = 0;
           newPosition = -1;
           delimsChanged = false;
           this.str = str;
           maxPosition = str.length();
           delimiters = delim;
           retDelims = returnDelims;
           setMaxDelimCodePoint();
       }

       /**
        * Constructs a string tokenizer for the specified string. The
        * characters in the {@code delim} argument are the delimiters
        * for separating tokens. Delimiter characters themselves will not
        * be treated as tokens.
        * <p>
        * Note that if {@code delim} is {@code null}, this constructor does
        * not throw an exception. However, trying to invoke other methods on the
        * resulting {@code StringTokenizer} may result in a
        * {@code NullPointerException}.
        *
        * @param   str     a string to be parsed.
        * @param   delim   the delimiters.
        * @throws  NullPointerException if str is {@code null}
        */
       public StringTokenizer(String str, String delim) {
           this(str, delim, false);
       }

       /**
        * Constructs a string tokenizer for the specified string. The
        * tokenizer uses the default delimiter set, which is
        * {@code " \t\n\r\f"}: the space character,
        * the tab character, the newline character, the carriage-return character,
        * and the form-feed character. Delimiter characters themselves will
        * not be treated as tokens.
        *
        * @param   str   a string to be parsed.
        * @throws  NullPointerException if str is {@code null}
        */
       public StringTokenizer(String str) {
           this(str, " \t\n\r\f", false);
       }

       /**
        * Skips delimiters starting from the specified position. If retDelims
        * is false, returns the index of the first non-delimiter character at or
        * after startPos. If retDelims is true, startPos is returned.
        *
        * @throws NullPointerException if the current delimiter set is {@code null}
        */
       private int skipDelimiters(int startPos) {
           if (delimiters == null) {
               throw new NullPointerException();
           }

           int position = startPos;
           while (!retDelims && position < maxPosition) {
               if (!hasSurrogates) {
                   char c = str.charAt(position);
                   if ((c > maxDelimCodePoint) || (delimiters.indexOf(c) < 0)) {
                       break;
                   }
                   position++;
               } else {
                   int c = str.codePointAt(position);
                   if ((c > maxDelimCodePoint) || !isDelimiter(c)) {
                       break;
                   }
                   position += Character.charCount(c);
               }
           }
           return position;
       }

       /**
        * Skips ahead from startPos and returns the index of the next delimiter
        * character encountered, or maxPosition if no such delimiter is found.
        * When retDelims is true and startPos itself is a delimiter, the index
        * just past that single delimiter is returned instead, so the delimiter
        * becomes a token of its own. Callers must pass startPos &lt; maxPosition.
        */
       private int scanToken(int startPos) {
           int position = startPos;
           while (position < maxPosition) {
               if (!hasSurrogates) {
                   char c = str.charAt(position);
                   if ((c <= maxDelimCodePoint) && (delimiters.indexOf(c) >= 0)) {
                       break;
                   }
                   position++;
               } else {
                   int c = str.codePointAt(position);
                   if ((c <= maxDelimCodePoint) && isDelimiter(c)) {
                       break;
                   }
                   position += Character.charCount(c);
               }
           }
           if (retDelims && (startPos == position)) {
               // Nothing was consumed: the character at startPos is a delimiter
               // that has to be returned as a token.
               if (!hasSurrogates) {
                   char c = str.charAt(position);
                   if ((c <= maxDelimCodePoint) && (delimiters.indexOf(c) >= 0)) {
                       position++;
                   }
               } else {
                   int c = str.codePointAt(position);
                   if ((c <= maxDelimCodePoint) && isDelimiter(c)) {
                       position += Character.charCount(c);
                   }
               }
           }
           return position;
       }

       /**
        * Returns whether the given code point is one of the delimiter code
        * points. Only valid while hasSurrogates is true.
        */
       private boolean isDelimiter(int codePoint) {
           for (int delimiterCodePoint : delimiterCodePoints) {
               if (delimiterCodePoint == codePoint) {
                   return true;
               }
           }
           return false;
       }

       /**
        * Tests if there are more tokens available from this tokenizer's string.
        * If this method returns {@code true}, then a subsequent call to
        * {@code nextToken} with no argument will successfully return a token.
        *
        * @return  {@code true} if and only if there is at least one token
        *          in the string after the current position; {@code false}
        *          otherwise.
        */
       public boolean hasMoreTokens() {
           /*
            * Temporarily store this position and use it in the following
            * nextToken() method only if the delimiters haven't been changed in
            * that nextToken() invocation.
            */
           newPosition = skipDelimiters(currentPosition);
           return (newPosition < maxPosition);
       }

       /**
        * Returns the next token from this string tokenizer.
        *
        * @return     the next token from this string tokenizer.
        * @throws     NoSuchElementException  if there are no more tokens in this
        *               tokenizer's string.
        */
       public String nextToken() {
           /*
            * If the next position was already computed by hasMoreTokens() and
            * the delimiters have NOT changed between that computation and this
            * invocation, reuse the computed value; otherwise skip delimiters
            * again using the current delimiter set.
            */
           currentPosition = (newPosition >= 0 && !delimsChanged)
               ? newPosition
               : skipDelimiters(currentPosition);

           /* Reset these anyway */
           delimsChanged = false;
           newPosition = -1;

           if (currentPosition >= maxPosition) {
               throw new NoSuchElementException();
           }
           int start = currentPosition;
           currentPosition = scanToken(currentPosition);
           return str.substring(start, currentPosition);
       }

       /**
        * Returns the next token in this string tokenizer's string. First,
        * the set of characters considered to be delimiters by this
        * {@code StringTokenizer} object is changed to be the characters in
        * the string {@code delim}. Then the next token in the string
        * after the current position is returned. The current position is
        * advanced beyond the recognized token.  The new delimiter set
        * remains the default after this call.
        * <p>
        * If {@code delim} is {@code null} the tokenizer is left unchanged.
        *
        * @param      delim   the new delimiters.
        * @return     the next token, after switching to the new delimiter set.
        * @throws     NoSuchElementException  if there are no more tokens in this
        *               tokenizer's string.
        * @throws     NullPointerException if delim is {@code null}
        */
       public String nextToken(String delim) {
           // Validate before touching any state so that a rejected argument
           // does not leave the tokenizer with a null delimiter set.
           if (delim == null) {
               throw new NullPointerException();
           }

           delimiters = delim;

           /* delimiter string specified, so set the appropriate flag. */
           delimsChanged = true;

           setMaxDelimCodePoint();
           return nextToken();
       }

       /**
        * Returns the same value as the {@code hasMoreTokens}
        * method. It exists so that this class can implement the
        * {@code Enumeration} interface.
        *
        * @return  {@code true} if there are more tokens;
        *          {@code false} otherwise.
        * @see     java.util.Enumeration
        * @see     java.util.StringTokenizer#hasMoreTokens()
        */
       @Override
       public boolean hasMoreElements() {
           return hasMoreTokens();
       }

       /**
        * Returns the same value as the {@code nextToken} method,
        * except that its declared return value is {@code Object} rather than
        * {@code String}. It exists so that this class can implement the
        * {@code Enumeration} interface.
        *
        * @return     the next token in the string.
        * @throws     NoSuchElementException  if there are no more tokens in this
        *               tokenizer's string.
        * @see        java.util.Enumeration
        * @see        java.util.StringTokenizer#nextToken()
        */
       @Override
       public Object nextElement() {
           return nextToken();
       }

       /**
        * Calculates the number of times that this tokenizer's
        * {@code nextToken} method can be called before it generates an
        * exception. The current position is not advanced.
        *
        * @return  the number of tokens remaining in the string using the current
        *          delimiter set.
        * @see     java.util.StringTokenizer#nextToken()
        */
       public int countTokens() {
           int count = 0;
           int currpos = currentPosition;
           while (currpos < maxPosition) {
               currpos = skipDelimiters(currpos);
               if (currpos >= maxPosition) {
                   break;
               }
               currpos = scanToken(currpos);
               count++;
           }
           return count;
       }
   }
       
   429         return count;
       
   430     }
       
   431 }