7183053: Optimize DoubleByte charset for String.getBytes()/new String(byte[])
author sherman
Tue, 17 Jul 2012 19:57:31 -0700
changeset 13257 5cf4ff2cbb37
parent 13256 5886d7607acd
child 13258 c24614ba26d4
7183053: Optimize DoubleByte charset for String.getBytes()/new String(byte[]) Summary: DoubleByte implements sun/nio.cs/ArrayDe/Encoder interface Reviewed-by: alanb
jdk/src/share/classes/sun/nio/cs/ext/DoubleByte.java
jdk/src/share/classes/sun/nio/cs/ext/HKSCS.java
jdk/test/sun/nio/cs/StrCodingBenchmark.java
jdk/test/sun/nio/cs/StrCodingBenchmarkDB.java
jdk/test/sun/nio/cs/TestStringCoding.java
--- a/jdk/src/share/classes/sun/nio/cs/ext/DoubleByte.java	Tue Jul 17 11:01:44 2012 -0700
+++ b/jdk/src/share/classes/sun/nio/cs/ext/DoubleByte.java	Tue Jul 17 19:57:31 2012 -0700
@@ -33,6 +33,8 @@
 import java.nio.charset.CoderResult;
 import java.util.Arrays;
 import sun.nio.cs.Surrogate;
+import sun.nio.cs.ArrayDecoder;
+import sun.nio.cs.ArrayEncoder;
 import static sun.nio.cs.CharsetMapping.*;
 
 /*
@@ -107,7 +109,7 @@
     }
 
     public static class Decoder extends CharsetDecoder
-                                implements DelegatableDecoder
+                                implements DelegatableDecoder, ArrayDecoder
     {
 
         final char[][] b2c;
@@ -209,6 +211,29 @@
                 return decodeBufferLoop(src, dst);
         }
 
+        public int decode(byte[] src, int sp, int len, char[] dst) {  // decodes src[sp, sp+len) into dst starting at index 0; returns the number of chars written
+            int dp = 0;                                 // next write index in dst
+            int sl = sp + len;                          // exclusive end of the input range
+            char repl = replacement().charAt(0);        // only the first char of the replacement string is used
+            while (sp < sl) {
+                int b1 = src[sp++] & 0xff;              // read as an unsigned byte
+                char c = b2cSB[b1];                     // single-byte table is tried first
+                if (c == UNMAPPABLE_DECODING) {
+                    if (sp < sl) {                      // a second byte is available
+                        int b2 = src[sp++] & 0xff;      // consumed even if it turns out to be out of range
+                        if (b2 >= b2Min && b2 <= b2Max) {
+                            c = b2c[b1][b2 - b2Min];    // double-byte lookup; may still yield UNMAPPABLE_DECODING
+                        }
+                    }
+                    if (c == UNMAPPABLE_DECODING) {     // truncated, out-of-range or unmapped sequence
+                        c = repl;
+                    }
+                }
+                dst[dp++] = c;                          // dst is not bounds-checked here
+            }
+            return dp;
+        }
+
         public void implReset() {
             super.implReset();
         }
@@ -228,6 +253,7 @@
                 return UNMAPPABLE_DECODING;
             return  b2c[b1][b2 - b2Min];
         }
+
     }
 
     // IBM_EBCDIC_DBCS
@@ -367,6 +393,46 @@
                 src.position(mark);
             }
         }
+
+        public int decode(byte[] src, int sp, int len, char[] dst) {  // SO/SI-stateful decode of src[sp, sp+len) into dst from index 0; returns chars written
+            int dp = 0;                                 // next write index in dst
+            int sl = sp + len;                          // exclusive end of the input range
+            currentState = SBCS;                        // every call starts in single-byte mode
+            char repl = replacement().charAt(0);        // only the first char of the replacement string is used
+            while (sp < sl) {
+                int b1 = src[sp++] & 0xff;
+                if (b1 == SO) {  // Shift out
+                    if (currentState != SBCS)
+                        dst[dp++] = repl;               // SO while not in SBCS mode: emit a replacement
+                    else
+                        currentState = DBCS;
+                } else if (b1 == SI) {                  // shift in
+                    if (currentState != DBCS)
+                        dst[dp++] = repl;               // SI while not in DBCS mode: emit a replacement
+                    else
+                        currentState = SBCS;
+                } else {
+                    char c =  UNMAPPABLE_DECODING;
+                    if (currentState == SBCS) {
+                        c = b2cSB[b1];                  // single-byte table lookup
+                        if (c == UNMAPPABLE_DECODING)
+                            c = repl;
+                    } else {
+                        if (sl == sp) {                 // DBCS lead byte with no second byte left
+                            c = repl;
+                        } else {
+                            int b2 = src[sp++] & 0xff;  // second byte is consumed whether or not the pair maps
+                            if (b2 < b2Min || b2 > b2Max ||
+                                (c = b2c[b1][b2 - b2Min]) == UNMAPPABLE_DECODING) {  // table lookup runs only when b2 is in range
+                                c = repl;
+                            }
+                        }
+                    }
+                    dst[dp++] = c;                      // dst is not bounds-checked here
+                }
+            }
+            return dp;
+        }
     }
 
     // EBCDIC_DBCS_ONLY
@@ -405,9 +471,37 @@
                 return CoderResult.malformedForLength(1);
             return CoderResult.unmappableForLength(2);
         }
+
+        public int decode(byte[] src, int sp, int len, char[] dst) {  // decodes src[sp, sp+len) into dst from index 0; returns chars written
+            int dp = 0;                                 // next write index in dst
+            int sl = sp + len;                          // exclusive end of the input range
+            char repl = replacement().charAt(0);        // only the first char of the replacement string is used
+            while (sp < sl) {
+                int b1 = src[sp++] & 0xff;
+                char c = b2cSB[b1];                     // single-byte table is tried first
+                if (c == UNMAPPABLE_DECODING) {
+                    if (sp < sl) {
+                        int b2 = src[sp++] & 0xff;
+                        if (b2 < b2Min || b2 > b2Max ||
+                            (c = b2c[b1][b2 - b2Min]) == UNMAPPABLE_DECODING) {  // table lookup runs only when b2 is in range
+                            if (b1 == SS2 || b1 == SS3) {
+                                sp--;                   // un-read b2 so the next iteration treats it as a lead byte
+                            }
+                            c = repl;
+                        }
+                    } else {
+                        c = repl;                       // lead byte with no second byte left
+                    }
+                }
+                dst[dp++] = c;                          // dst is not bounds-checked here
+            }
+            return dp;
+        }
     }
 
-    public static class Encoder extends CharsetEncoder {
+    public static class Encoder extends CharsetEncoder
+                                implements ArrayEncoder
+    {
         final int MAX_SINGLEBYTE = 0xff;
         private final char[] c2b;
         private final char[] c2bIndex;
@@ -516,6 +610,35 @@
                 return encodeBufferLoop(src, dst);
         }
 
+        public int encode(char[] src, int sp, int len, byte[] dst) {  // encodes src[sp, sp+len) into dst from index 0; returns bytes written
+            int dp = 0;
+            int sl = sp + len;
+            // dst is deliberately not bounds-checked; the caller must supply a large enough array
+            while (sp < sl) {
+                char c = src[sp++];
+                int bb = encodeChar(c);
+                if (bb == UNMAPPABLE_ENCODING) {
+                    if (Character.isHighSurrogate(c) && sp < sl &&
+                        Character.isLowSurrogate(src[sp])) {
+                        sp++;                           // a well-formed surrogate pair gets one replacement
+                    }
+                    byte[] repl = replacement();
+                    dst[dp++] = repl[0];                // at most the first two replacement bytes are emitted
+                    if (repl.length > 1)
+                        dst[dp++] = repl[1];
+                    continue;
+                } //else
+                if (bb > MAX_SINGLEBYTE) { // DoubleByte
+                    dst[dp++] = (byte)(bb >> 8);
+                    dst[dp++] = (byte)bb;
+                } else {                          // SingleByte
+                    dst[dp++] = (byte)bb;
+                }
+
+            }
+            return dp;
+        }
+
         public int encodeChar(char ch) {
             return c2b[c2bIndex[ch >> 8] + (ch & 0xff)];
         }
@@ -604,7 +727,6 @@
         }
     }
 
-    // EBCDIC_DBCS_ONLY
     public static class Encoder_EBCDIC_DBCSONLY extends Encoder {
         Encoder_EBCDIC_DBCSONLY(Charset cs, byte[] repl,
                                 char[] c2b, char[] c2bIndex) {
@@ -619,7 +741,6 @@
         }
     }
 
-    // for IBM_EBCDIC_DBCS
     public static class Encoder_EBCDIC extends Encoder {
         static final int SBCS = 0;
         static final int DBCS = 1;
@@ -741,6 +862,47 @@
                 src.position(mark);
             }
         }
+
+        public int encode(char[] src, int sp, int len, byte[] dst) {  // encodes src[sp, sp+len) into dst from index 0, inserting SO/SI shifts; returns bytes written
+            int dp = 0;                                 // next write index in dst
+            int sl = sp + len;                          // exclusive end of the input range
+            while (sp < sl) {                           // note: currentState is not reset on entry
+                char c = src[sp++];
+                int bb = encodeChar(c);
+
+                if (bb == UNMAPPABLE_ENCODING) {
+                    if (Character.isHighSurrogate(c) && sp < sl &&
+                        Character.isLowSurrogate(src[sp])) {
+                        sp++;                           // a well-formed surrogate pair gets one replacement
+                    }
+                    byte[] repl = replacement();        // written as-is: this branch neither checks nor changes currentState
+                    dst[dp++] = repl[0];
+                    if (repl.length > 1)
+                        dst[dp++] = repl[1];            // at most the first two replacement bytes are emitted
+                    continue;
+                } //else
+                if (bb > MAX_SINGLEBYTE) {           // DoubleByte
+                    if (currentState == SBCS) {
+                        currentState = DBCS;
+                        dst[dp++] = SO;                 // shift marker precedes the first double-byte char of a run
+                    }
+                    dst[dp++] = (byte)(bb >> 8);
+                    dst[dp++] = (byte)bb;
+                } else {                             // SingleByte
+                    if (currentState == DBCS) {
+                         currentState = SBCS;
+                         dst[dp++] = SI;                // shift marker precedes the first single-byte char after a DBCS run
+                    }
+                    dst[dp++] = (byte)bb;
+                }
+            }
+
+            if (currentState == DBCS) {                 // close an open DBCS run so the call always ends in SBCS
+                 currentState = SBCS;
+                 dst[dp++] = SI;
+            }
+            return dp;
+        }
     }
 
     // EUC_SIMPLE
--- a/jdk/src/share/classes/sun/nio/cs/ext/HKSCS.java	Tue Jul 17 11:01:44 2012 -0700
+++ b/jdk/src/share/classes/sun/nio/cs/ext/HKSCS.java	Tue Jul 17 19:57:31 2012 -0700
@@ -175,6 +175,40 @@
             }
         }
 
+        public int decode(byte[] src, int sp, int len, char[] dst) {  // decodes src[sp, sp+len) into dst from index 0; returns chars written
+            int dp = 0;                                 // next write index in dst
+            int sl = sp + len;                          // exclusive end of the input range
+            char repl = replacement().charAt(0);        // only the first char of the replacement string is used
+            while (sp < sl) {
+                int b1 = src[sp++] & 0xff;
+                char c = decodeSingle(b1);
+                if (c == UNMAPPABLE_DECODING) {
+                    if (sl == sp) {                     // lead byte with no second byte left
+                        c = repl;
+                    } else {
+                        int b2 = src[sp++] & 0xff;      // second byte is consumed whether or not the pair maps
+                        if (b2 < b2Min || b2 > b2Max) {
+                            c = repl;
+                        } else if ((c = decodeDouble(b1, b2)) == UNMAPPABLE_DECODING) {  // fall through the lookups in order: decodeDouble, decodeDoubleEx, decodeBig5
+                            c = decodeDoubleEx(b1, b2);     //supp
+                            if (c == UNMAPPABLE_DECODING) {
+                                c = decodeBig5(b1, b2);     //big5
+                                if (c == UNMAPPABLE_DECODING)
+                                    c = repl;
+                            } else {
+                                // supplementary character in u+2xxxx area
+                                dst[dp++] = Surrogate.high(0x20000 + c);  // one input pair produces two output chars here
+                                dst[dp++] = Surrogate.low(0x20000 + c);
+                                continue;               // skip the single-char store below
+                            }
+                        }
+                    }
+                }
+                dst[dp++] = c;                          // dst is not bounds-checked here
+            }
+            return dp;
+        }
+
         public CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) {
             if (src.hasArray() && dst.hasArray())
                 return decodeArrayLoop(src, dst);
@@ -322,6 +356,36 @@
                 return encodeBufferLoop(src, dst);
         }
 
+        public int encode(char[] src, int sp, int len, byte[] dst) {  // encodes src[sp, sp+len) into dst from index 0; returns bytes written (dst is not bounds-checked)
+            int dp = 0;
+            int sl = sp + len;
+            while (sp < sl) {
+                char c = src[sp++];
+                int bb = encodeChar(c);
+                if (bb == UNMAPPABLE_ENCODING) {
+                    // try c plus the following low surrogate as one supplementary code point
+                    if (!Character.isHighSurrogate(c) || sp == sl ||
+                        !Character.isLowSurrogate(src[sp]) ||
+                        (bb = encodeSupp(Character.toCodePoint(c, src[sp++])))
+                        == UNMAPPABLE_ENCODING) {
+                        byte[] repl = replacement();
+                        dst[dp++] = repl[0];            // at most the first two replacement bytes are emitted
+                        if (repl.length > 1)
+                            dst[dp++] = repl[1];
+                        continue;
+                    }
+                }
+                if (bb > MAX_SINGLEBYTE) {        // DoubleByte
+                    dst[dp++] = (byte)(bb >> 8);
+                    dst[dp++] = (byte)bb;
+                } else {                          // SingleByte
+                    dst[dp++] = (byte)bb;
+                }
+            }
+            return dp;
+        }
+
+
         static char[] C2B_UNMAPPABLE = new char[0x100];
         static {
             Arrays.fill(C2B_UNMAPPABLE, (char)UNMAPPABLE_ENCODING);
--- a/jdk/test/sun/nio/cs/StrCodingBenchmark.java	Tue Jul 17 11:01:44 2012 -0700
+++ b/jdk/test/sun/nio/cs/StrCodingBenchmark.java	Tue Jul 17 19:57:31 2012 -0700
@@ -75,7 +75,7 @@
         return nanoss;
     }
 
-    public static void time(Job ... jobs) throws Throwable {
+    public static long[] time(Job ... jobs) throws Throwable {
 
         long[] warmup = time0(jobs); // Warm up run
         long[] nanoss = time0(jobs); // Real timing run
@@ -110,6 +110,7 @@
         // Print out absolute and relative times, calibrated against first job
         for (int i = 0; i < jobs.length; i++)
             System.out.printf(format, jobs[i].name(), milliss[i], ratios[i]);
+        return milliss;
     }
 
     public static Job[] filter(Pattern filter, Job[] jobs) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/test/sun/nio/cs/StrCodingBenchmarkDB.java	Tue Jul 17 19:57:31 2012 -0700
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import java.util.*;
+import java.nio.*;
+import java.nio.charset.*;
+import java.util.concurrent.*;
+import java.util.regex.Pattern;
+
+public class StrCodingBenchmarkDB extends StrCodingBenchmark {
+    // Times new String(byte[]) and String.getBytes() for a list of double-byte charsets, addressing
+    // each charset both by name ("csn") and by Charset object ("cs"), at lengths 8 to 2048.
+    public static void main(String[] args) throws Throwable {
+        final int itrs = Integer.getInteger("iterations", 100000);
+        // NOTE: size, subsize, maxchar, filter and rnd below are read but not used by this benchmark.
+        final int size       = Integer.getInteger("size", 2048);
+        final int subsize    = Integer.getInteger("subsize", 128);
+        final int maxchar    = Integer.getInteger("maxchar", 128);
+        final String regex = System.getProperty("filter");
+        final Pattern filter = (regex == null) ? null : Pattern.compile(regex);
+        final boolean useSecurityManager = Boolean.getBoolean("SecurityManager");
+        if (useSecurityManager)
+            System.setSecurityManager(new PermissiveSecurityManger());
+        final Random rnd = new Random();
+
+        String[] csns = new String[] {
+            "Big5",
+            "Johab",
+            "EUC_CN",
+            "EUC_KR",
+            "MS932",
+            "MS936",
+            "MS949",
+            "MS950",
+            "GBK",
+
+            "Big5_HKSCS",
+            "Big5_HKSCS_2001",
+            "Big5_Solaris",
+            "MS950_HKSCS",
+            "MS950_HKSCS_XP",
+            "IBM1364",
+            "IBM1381",
+            "IBM1383",
+            "IBM930",
+            "IBM933",
+            "IBM935",
+            "IBM937",
+            "IBM939",
+            "IBM942",
+            "IBM943",
+            "IBM948",
+            "IBM949",
+            "IBM950",
+            "IBM970",
+        };
+
+        ArrayList<long[]> sum = new ArrayList<>();
+
+        for (final String csn : csns) {
+            final Charset cs = Charset.forName(csn);
+            final int maxCps = 0x4000;    // cap on sampled chars; also the list's initial capacity
+            List<Integer> cps = new ArrayList<>(maxCps);
+            int cp = 0;
+            int n = 0;
+            CharsetEncoder enc = cs.newEncoder();
+            while (cp < 0x10000 && n < maxCps) {   // collect the first maxCps encodable BMP chars
+                if (enc.canEncode((char)cp)) {
+                    cps.add(cp);
+                    n++;
+                }
+                cp++;
+            }
+            Collections.shuffle(cps);
+            char[] ca = new char[cps.size()];
+            for (int i = 0; i < cps.size(); i++)
+                ca[i] = (char)(int)cps.get(i);
+
+
+            System.out.printf("%n--------%s---------%n", csn);
+            for (int sz = 8; sz <= 2048; sz *= 2) {
+                System.out.printf("   [len=%d]%n", sz);
+
+                final char[] chars  = Arrays.copyOf(ca, sz);   // zero-padded if fewer than sz chars were sampled
+                final String str = new String(chars);
+                final byte[] bs  = str.getBytes(cs);
+
+                Job[] jobs = {
+
+                    new Job("String decode: csn") {
+                    public void work() throws Throwable {
+                        for (int i = 0; i < itrs; i++)
+                            new String(bs, csn);
+                    }},
+
+                    new Job("String decode: cs") {
+                    public void work() throws Throwable {
+                        for (int i = 0; i < itrs; i++)
+                            new String(bs, cs);
+                    }},
+
+                    new Job("String encode: csn") {
+                    public void work() throws Throwable {
+                        for (int i = 0; i < itrs; i++)
+                            str.getBytes(csn);
+                    }},
+
+                    new Job("String encode: cs") {
+                    public void work() throws Throwable {
+                        for (int i = 0; i < itrs; i++)
+                            str.getBytes(cs);
+                    }},
+                };
+                sum.add(time(jobs));
+
+            }
+        }
+    }
+}
--- a/jdk/test/sun/nio/cs/TestStringCoding.java	Tue Jul 17 11:01:44 2012 -0700
+++ b/jdk/test/sun/nio/cs/TestStringCoding.java	Tue Jul 17 19:57:31 2012 -0700
@@ -24,7 +24,7 @@
  */
 
 /* @test
-   @bug 6636323 6636319 7040220 7096080
+   @bug 6636323 6636319 7040220 7096080 7183053
    @summary Test if StringCoding and NIO result have the same de/encoding result
  * @run main/othervm/timeout=2000 TestStringCoding
  */
@@ -70,11 +70,62 @@
                     }
                     test(cs, Arrays.copyOf(bmpCA, clen), Arrays.copyOf(sbBA, blen));
                 }
+
+                testMixed(cs);
                 System.out.println("done!");
             }
         }
     }
 
+    // Checks that String.getBytes()/new String(byte[]) give the same result as the NIO coders for cs.
+    static void testMixed(Charset cs) throws Throwable {
+        CharsetDecoder dec = cs.newDecoder()
+            .onMalformedInput(CodingErrorAction.REPLACE)
+            .onUnmappableCharacter(CodingErrorAction.REPLACE);
+        CharsetEncoder enc = cs.newEncoder()
+            .onMalformedInput(CodingErrorAction.REPLACE)
+            .onUnmappableCharacter(CodingErrorAction.REPLACE);
+        List<Integer> cps = new ArrayList<>(0x10000);
+        // Build the input from every BMP char the encoder reports as
+        // encodable; the shuffle below puts them in random order.
+        for (int cp = 0; cp < 0x10000; cp++) {
+            if (enc.canEncode((char)cp)) {
+                cps.add(cp);
+            }
+        }
+        Collections.shuffle(cps);
+        char[] bmpCA = new char[cps.size()];
+        for (int i = 0; i < cps.size(); i++)
+            bmpCA[i] = (char)(int)cps.get(i);
+        String bmpStr = new String(bmpCA);
+        //getBytes(csn);
+        byte[] bmpBA = bmpStr.getBytes(cs.name());
+        ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA));
+        byte[] baNIO = new byte[bf.limit()];
+        bf.get(baNIO, 0, baNIO.length);
+        if (!Arrays.equals(bmpBA, baNIO)) {
+            throw new RuntimeException("getBytes(csn) failed  -> " + cs.name());
+        }
+
+        //getBytes(cs);
+        bmpBA = bmpStr.getBytes(cs);
+        if (!Arrays.equals(bmpBA, baNIO))
+            throw new RuntimeException("getBytes(cs) failed  -> " + cs.name());
+
+        //new String(csn);
+        String strSC = new String(bmpBA, cs.name());
+        String strNIO = dec.reset().decode(ByteBuffer.wrap(bmpBA)).toString();
+        if(!strNIO.equals(strSC)) {
+            throw new RuntimeException("new String(csn) failed  -> " + cs.name());
+        }
+
+        //new String(cs);
+        strSC = new String(bmpBA, cs);
+        if (!strNIO.equals(strSC))
+            throw new RuntimeException("new String(cs) failed  -> " + cs.name());
+    }
+
     static void test(Charset cs, char[] bmpCA, byte[] sbBA) throws Throwable {
         String bmpStr = new String(bmpCA);
         CharsetDecoder dec = cs.newDecoder()
@@ -100,6 +151,7 @@
         //new String(csn);
         String strSC = new String(sbBA, cs.name());
         String strNIO = dec.reset().decode(ByteBuffer.wrap(sbBA)).toString();
+
         if(!strNIO.equals(strSC))
             throw new RuntimeException("new String(csn) failed  -> " + cs.name());
 
@@ -112,7 +164,7 @@
         if (enc instanceof sun.nio.cs.ArrayEncoder &&
             cs.contains(Charset.forName("ASCII"))) {
             if (cs.name().equals("UTF-8") ||     // utf8 handles surrogates
-                cs.name().equals("CESU-8"))       // utf8 handles surrogates
+                cs.name().equals("CESU-8"))      // utf8 handles surrogates
                 return;
             enc.replaceWith(new byte[] { (byte)'A'});
             sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder)enc;
@@ -137,12 +189,16 @@
                                                        cs.name())))
                 throw new RuntimeException("encode3(surrogates) failed  -> "
                                            + cs.name());
+            /* sun.nio.cs.ArrayDeEncoder works on the assumption that the
+               invoker (StringCoder) allocates enough output buf, utf8
+               and double-byte coder does not check the output buffer limit.
             ba = new byte[str.length() - 1];
             n = cae.encode(str.toCharArray(), 0, str.length(), ba);
-            if (n != 7 || !"abABABc".equals(new String(ba, 0, n,
-                                                      cs.name())))
+            if (n != 7 || !"abABABc".equals(new String(ba, 0, n, cs.name()))) {
                 throw new RuntimeException("encode4(surrogates) failed  -> "
                                            + cs.name());
+            }
+            */
         }
 
     }