8027607: (rb) Provide UTF-8 based properties resource bundles
author naoto
Wed, 29 Jul 2015 13:36:53 -0700
changeset 31902 1c90f9a5a76d
parent 31901 8724b15d3679
child 31903 752473818053
8027607: (rb) Provide UTF-8 based properties resource bundles Reviewed-by: okutsu, sherman
jdk/src/java.base/share/classes/java/util/PropertyResourceBundle.java
jdk/src/java.base/share/classes/sun/util/PropertyResourceBundleCharset.java
jdk/test/java/util/ResourceBundle/UTF8Properties/CodePointTest.java
jdk/test/java/util/ResourceBundle/UTF8Properties/IllegalSequenceTest.java
--- a/jdk/src/java.base/share/classes/java/util/PropertyResourceBundle.java	Wed Jul 29 11:47:19 2015 +0200
+++ b/jdk/src/java.base/share/classes/java/util/PropertyResourceBundle.java	Wed Jul 29 13:36:53 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -40,8 +40,17 @@
 package java.util;
 
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.MalformedInputException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnmappableCharacterException;
+import java.security.AccessController;
+import java.util.Locale;
+import sun.security.action.GetPropertyAction;
+import sun.util.PropertyResourceBundleCharset;
 import sun.util.ResourceBundleEnumeration;
 
 /**
@@ -108,11 +117,20 @@
  * <strong>Note:</strong> PropertyResourceBundle can be constructed either
  * from an InputStream or a Reader, which represents a property file.
  * Constructing a PropertyResourceBundle instance from an InputStream requires
- * that the input stream be encoded in ISO-8859-1.  In that case, characters
- * that cannot be represented in ISO-8859-1 encoding must be represented by Unicode Escapes
- * as defined in section 3.3 of
- * <cite>The Java&trade; Language Specification</cite>
+ * that the input stream be encoded in UTF-8. By default, if a
+ * {@link java.nio.charset.MalformedInputException} or an
+ * {@link java.nio.charset.UnmappableCharacterException} occurs on reading the
+ * input stream, then the PropertyResourceBundle instance resets to the state
+ * before the exception, re-reads the input stream in {@code ISO-8859-1}, and
+ * continues reading. If the system property
+ * {@code java.util.PropertyResourceBundle.encoding} is set to either
+ * "ISO-8859-1" or "UTF-8", the input stream is solely read in that encoding,
+ * and the exception is thrown if an invalid sequence is encountered.
+ * If "ISO-8859-1" is specified, characters that cannot be represented in
+ * ISO-8859-1 encoding must be represented by Unicode Escapes as defined in section
+ * 3.3 of <cite>The Java&trade; Language Specification</cite>
  * whereas the other constructor which takes a Reader does not have that limitation.
+ * Other encoding values are ignored for this system property.
  *
  * @see ResourceBundle
  * @see ListResourceBundle
@@ -120,10 +138,26 @@
  * @since 1.1
  */
 public class PropertyResourceBundle extends ResourceBundle {
+
+    // Check whether the strict encoding is specified.
+    // The possible encoding is either "ISO-8859-1" or "UTF-8".
+    private static final String encoding =
+        AccessController.doPrivileged(
+            new GetPropertyAction("java.util.PropertyResourceBundle.encoding", ""))
+        .toUpperCase(Locale.ROOT);
+
     /**
      * Creates a property resource bundle from an {@link java.io.InputStream
-     * InputStream}.  The property file read with this constructor
-     * must be encoded in ISO-8859-1.
+     * InputStream}. This constructor reads the property file in UTF-8 by default.
+     * If a {@link java.nio.charset.MalformedInputException} or an
+     * {@link java.nio.charset.UnmappableCharacterException} occurs on reading the
+     * input stream, then the PropertyResourceBundle instance resets to the state
+     * before the exception, re-reads the input stream in {@code ISO-8859-1} and
+     * continues reading. If the system property
+     * {@code java.util.PropertyResourceBundle.encoding} is set to either
+     * "ISO-8859-1" or "UTF-8", the input stream is solely read in that encoding,
+     * and the exception is thrown if an invalid sequence is encountered. Other
+     * encoding values are ignored for this system property.
      *
      * @param stream an InputStream that represents a property file
      *        to read from.
@@ -131,12 +165,19 @@
      * @throws NullPointerException if <code>stream</code> is null
      * @throws IllegalArgumentException if {@code stream} contains a
      *     malformed Unicode escape sequence.
+     * @throws MalformedInputException if the system property
+     *     {@code java.util.PropertyResourceBundle.encoding} is set to "UTF-8"
+     *     and {@code stream} contains an invalid UTF-8 byte sequence.
+     * @throws UnmappableCharacterException if the system property
+     *     {@code java.util.PropertyResourceBundle.encoding} is set to "UTF-8"
+     *     and {@code stream} contains an unmappable UTF-8 byte sequence.
      */
     @SuppressWarnings({"unchecked", "rawtypes"})
     public PropertyResourceBundle (InputStream stream) throws IOException {
-        Properties properties = new Properties();
-        properties.load(stream);
-        lookup = new HashMap(properties);
+        this(new InputStreamReader(stream,
+            "ISO-8859-1".equals(encoding) ?
+                StandardCharsets.ISO_8859_1.newDecoder() :
+                new PropertyResourceBundleCharset("UTF-8".equals(encoding)).newDecoder()));
     }
 
     /**
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/java.base/share/classes/sun/util/PropertyResourceBundleCharset.java	Wed Jul 29 13:36:53 2015 -0700
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package sun.util;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+import java.util.Objects;
+import sun.util.logging.PlatformLogger;
+
+/**
+ * A Charset implementation used by PropertyResourceBundle for loading
+ * properties files. It first tries to decode the file as UTF-8; if that fails
+ * (and strict UTF-8 mode is not set), it decodes the file as ISO-8859-1 instead.
+ */
+public class PropertyResourceBundleCharset extends Charset {
+
+    private boolean strictUTF8 = false;
+
+    public PropertyResourceBundleCharset(boolean strictUTF8) {
+        this(PropertyResourceBundleCharset.class.getCanonicalName(), null);
+        this.strictUTF8 = strictUTF8;
+    }
+
+    public PropertyResourceBundleCharset(String canonicalName, String[] aliases) {
+        super(canonicalName, aliases);
+    }
+
+    @Override
+    public boolean contains(Charset cs) {
+        return false;
+    }
+
+    @Override
+    public CharsetDecoder newDecoder() {
+        return new PropertiesFileDecoder(this, 1.0f, 1.0f);
+    }
+
+    @Override
+    public CharsetEncoder newEncoder() {
+        throw new UnsupportedOperationException("Encoding is not supported");
+    }
+
+    private final class PropertiesFileDecoder extends CharsetDecoder {
+
+        private CharsetDecoder cdUTF_8 = StandardCharsets.UTF_8.newDecoder()
+                                .onMalformedInput(CodingErrorAction.REPORT)
+                                .onUnmappableCharacter(CodingErrorAction.REPORT);
+        private CharsetDecoder cdISO_8859_1 = null;
+
+        protected PropertiesFileDecoder(Charset cs,
+                float averageCharsPerByte, float maxCharsPerByte) {
+            super(cs, averageCharsPerByte, maxCharsPerByte);
+        }
+
+        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
+            if (Objects.nonNull(cdISO_8859_1)) {
+                return cdISO_8859_1.decode(in, out, false);
+            }
+            in.mark();
+            out.mark();
+
+            CoderResult cr = cdUTF_8.decode(in, out, false);
+            if (cr.isUnderflow() || cr.isOverflow() ||
+                PropertyResourceBundleCharset.this.strictUTF8) {
+                return cr;
+            }
+
+            in.reset();
+            out.reset();
+
+            PlatformLogger.getLogger(getClass().getCanonicalName()).info(
+                "Invalid or unmappable UTF-8 sequence detected. " +
+                "Switching encoding from UTF-8 to ISO-8859-1");
+            cdISO_8859_1 = StandardCharsets.ISO_8859_1.newDecoder();
+            return cdISO_8859_1.decode(in, out, false);
+        }
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/test/java/util/ResourceBundle/UTF8Properties/CodePointTest.java	Wed Jul 29 13:36:53 2015 -0700
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/*
+ * @test
+ * @bug 8027607
+ * @summary Test that UTF-8 based properties files can be loaded successfully.
+ * @run main CodePointTest
+ * @run main/othervm -Djava.util.PropertyResourceBundle.encoding=ISO-8859-1 CodePointTest
+ * @run main/othervm -Djava.util.PropertyResourceBundle.encoding=UTF-8 CodePointTest
+ */
+
+import java.io.*;
+import java.nio.charset.*;
+import java.nio.file.*;
+import java.util.*;
+import static java.util.ResourceBundle.Control;
+import java.util.stream.*;
+
+/*
+ * Dumps every legal character in ISO-8859-1/UTF-8 into
+ * a <CharSet>.properties file. Each entry has a form of
+ * "keyXXXX=c", where "XXXX" is a code point (variable length)
+ * and "c" is the character encoded in the passed character set.
+ * Then, load it with ResourceBundle.Control.newBundle() and compare both
+ * contents. This confirms the following two functions:
+ *  - For UTF-8.properties, UTF-8 code points are loaded correctly
+ *  - For ISO-8859-1.properties, UTF-8->ISO-8859-1 fallback works
+ *
+ * Repeats the same test with "java.util.PropertyResourceBundle.encoding"
+ * set to "ISO-8859-1", and confirms only UTF-8 properties loading fails.
+ */
+public class CodePointTest {
+    static final Charset[] props = {StandardCharsets.ISO_8859_1,
+                                    StandardCharsets.UTF_8,
+                                    StandardCharsets.US_ASCII};
+    static final String encoding =
+        System.getProperty("java.util.PropertyResourceBundle.encoding", "");
+
+    public static void main(String[] args) {
+        for (Charset cs : props) {
+            try {
+                checkProps(cs,
+                    cs == StandardCharsets.UTF_8 &&
+                    encoding.equals("ISO-8859-1"));
+
+                if (cs == StandardCharsets.ISO_8859_1 &&
+                    encoding.equals("UTF-8")) {
+                    // should not happen
+                    throw new RuntimeException("Reading ISO-8859-1 properties in "+
+                        "strict UTF-8 encoding should throw an exception");
+                }
+            } catch (IOException e) {
+                if ((e instanceof MalformedInputException ||
+                     e instanceof UnmappableCharacterException) &&
+                    cs == StandardCharsets.ISO_8859_1 &&
+                    encoding.equals("UTF-8")) {
+                    // Expected exception is correctly detected.
+                } else {
+                    throw new RuntimeException(e);
+                }
+            }
+        }
+    }
+
+    static void checkProps(Charset cs, boolean shouldFail) throws IOException {
+        int start = Character.MIN_CODE_POINT;
+        int end= 0;
+
+        switch (cs.name()) {
+        case "ISO-8859-1":
+            end = 0xff;
+            break;
+        case "UTF-8":
+            end = Character.MAX_CODE_POINT;
+            break;
+        case "US-ASCII":
+            end = 0x7f;
+            break;
+        default:
+            assert false;
+        }
+
+        Properties p = new Properties();
+        String outputName = cs.name() + ".properties";
+
+        // Forget previous test artifacts
+        ResourceBundle.clearCache();
+
+        IntStream.range(start, end+1).forEach(c ->
+            {
+                if (Character.isDefined(c) &&
+                    (Character.isSupplementaryCodePoint(c) ||
+                     !Character.isSurrogate((char)c))) {
+                    p.setProperty("key"+Integer.toHexString(c),
+                        Character.isSupplementaryCodePoint(c) ?
+                            String.valueOf(Character.toChars(c)) :
+                            Character.toString((char)c));
+                }
+            }
+        );
+
+        try (BufferedWriter bw = Files.newBufferedWriter(
+                 FileSystems.getDefault().getPath(System.getProperty("test.classes", "."),
+                 outputName), cs)) {
+            p.store(bw, null);
+        } catch (IOException ex) {
+            throw new RuntimeException(ex);
+        }
+
+        // try loading it
+        Control c = Control.getControl(Control.FORMAT_PROPERTIES);
+        ResourceBundle rb;
+        try {
+            rb = c.newBundle(cs.name(), Locale.ROOT, "java.properties",
+                        CodePointTest.class.getClassLoader(), false);
+        } catch (IllegalAccessException |
+                 InstantiationException ex) {
+            throw new RuntimeException(ex);
+        }
+        Properties result = new Properties();
+        rb.keySet().stream().forEach((key) -> {
+            result.setProperty(key, rb.getString(key));
+        });
+
+        if (!p.equals(result) && !shouldFail) {
+            System.out.println("Charset: "+cs);
+            rb.keySet().stream().sorted().forEach((key) -> {
+                if (!p.getProperty(key).equals(result.getProperty(key))) {
+                    System.out.println(key+": file: "+p.getProperty(key)+", RB: "+result.getProperty(key));
+                }
+            });
+            throw new RuntimeException("not equal!");
+        }
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/test/java/util/ResourceBundle/UTF8Properties/IllegalSequenceTest.java	Wed Jul 29 13:36:53 2015 -0700
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/*
+ * @test
+ * @bug 8027607
+ * @summary Test whether illegal UTF-8 sequences are handled correctly.
+ * @run main/othervm -Djava.util.PropertyResourceBundle.encoding=UTF-8 IllegalSequenceTest
+ */
+
+import java.io.*;
+import java.nio.charset.*;
+import java.util.*;
+
+public class IllegalSequenceTest {
+    static final byte[][] illegalSequences = {
+        {(byte)0xc0, (byte)0xaf}, // non-shortest UTF-8
+        {(byte)0xc2, (byte)0xe0}, // consecutive leading bytes
+        {(byte)0xc2, (byte)0x80, (byte)0x80}, // two byte leading + 2 trailing
+        {(byte)0xe0, (byte)0x80}, // three byte leading + 1 trailing
+        {(byte)0xf4, (byte)0x90, (byte)0x80, (byte)0x80}, // 0x110000 (over U+10FFFF)
+    };
+
+    public static void main(String[] args) throws IOException {
+        for (byte[] illegalSec: illegalSequences) {
+            try (InputStream is = new ByteArrayInputStream(illegalSec)) {
+                ResourceBundle rb = new PropertyResourceBundle(is);
+                rb.getString("key");
+            } catch (MalformedInputException |
+                    UnmappableCharacterException e) {
+                // success
+                continue;
+            }
+            throw new RuntimeException("Excepted exception was not thrown.");
+        }
+    }
+}