7183053: Optimize DoubleByte charset for String.getBytes()/new String(byte[])
Summary: DoubleByte implements the sun.nio.cs.ArrayDecoder/ArrayEncoder interfaces
Reviewed-by: alanb
--- a/jdk/src/share/classes/sun/nio/cs/ext/DoubleByte.java Tue Jul 17 11:01:44 2012 -0700
+++ b/jdk/src/share/classes/sun/nio/cs/ext/DoubleByte.java Tue Jul 17 19:57:31 2012 -0700
@@ -33,6 +33,8 @@
import java.nio.charset.CoderResult;
import java.util.Arrays;
import sun.nio.cs.Surrogate;
+import sun.nio.cs.ArrayDecoder;
+import sun.nio.cs.ArrayEncoder;
import static sun.nio.cs.CharsetMapping.*;
/*
@@ -107,7 +109,7 @@
}
public static class Decoder extends CharsetDecoder
- implements DelegatableDecoder
+ implements DelegatableDecoder, ArrayDecoder
{
final char[][] b2c;
@@ -209,6 +211,29 @@
return decodeBufferLoop(src, dst);
}
+ public int decode(byte[] src, int sp, int len, char[] dst) { // array fast path: decodes src[sp, sp+len) into dst starting at index 0
+ int dp = 0; // next write index in dst
+ int sl = sp + len; // exclusive end of the input range
+ char repl = replacement().charAt(0); // only the first char of the replacement string is used
+ while (sp < sl) {
+ int b1 = src[sp++] & 0xff; // mask to 0..255 for table lookup
+ char c = b2cSB[b1]; // try the single-byte table first
+ if (c == UNMAPPABLE_DECODING) {
+ if (sp < sl) { // a second byte is available
+ int b2 = src[sp++] & 0xff; // consumed even if it turns out to be out of range
+ if (b2 >= b2Min && b2 <= b2Max) {
+ c = b2c[b1][b2 - b2Min];
+ }
+ }
+ if (c == UNMAPPABLE_DECODING) { // no second byte, second byte out of range, or no mapping
+ c = repl;
+ }
+ }
+ dst[dp++] = c; // dst.length is never checked in this method
+ }
+ return dp; // number of chars written to dst
+ }
+
public void implReset() {
super.implReset();
}
@@ -228,6 +253,7 @@
return UNMAPPABLE_DECODING;
return b2c[b1][b2 - b2Min];
}
+
}
// IBM_EBCDIC_DBCS
@@ -367,6 +393,46 @@
src.position(mark);
}
}
+
+ public int decode(byte[] src, int sp, int len, char[] dst) { // array fast path that tracks the SO/SI shift state
+ int dp = 0; // next write index in dst
+ int sl = sp + len; // exclusive end of the input range
+ currentState = SBCS; // every call starts in single-byte mode
+ char repl = replacement().charAt(0); // only the first char of the replacement string is used
+ while (sp < sl) {
+ int b1 = src[sp++] & 0xff;
+ if (b1 == SO) { // Shift out
+ if (currentState != SBCS)
+ dst[dp++] = repl; // SO while not in SBCS mode: emit a replacement
+ else
+ currentState = DBCS;
+ } else if (b1 == SI) { // counterpart of SO: switches back to SBCS mode
+ if (currentState != DBCS)
+ dst[dp++] = repl; // SI while not in DBCS mode: emit a replacement
+ else
+ currentState = SBCS;
+ } else {
+ char c = UNMAPPABLE_DECODING;
+ if (currentState == SBCS) {
+ c = b2cSB[b1];
+ if (c == UNMAPPABLE_DECODING)
+ c = repl;
+ } else {
+ if (sl == sp) { // first byte of a pair with no second byte left
+ c = repl;
+ } else {
+ int b2 = src[sp++] & 0xff;
+ if (b2 < b2Min || b2 > b2Max ||
+ (c = b2c[b1][b2 - b2Min]) == UNMAPPABLE_DECODING) { // table lookup only runs when b2 is in range
+ c = repl;
+ }
+ }
+ }
+ dst[dp++] = c;
+ }
+ }
+ return dp; // number of chars written to dst
+ }
}
// EBCDIC_DBCS_ONLY
@@ -405,9 +471,37 @@
return CoderResult.malformedForLength(1);
return CoderResult.unmappableForLength(2);
}
+
+ public int decode(byte[] src, int sp, int len, char[] dst) { // array fast path: decodes src[sp, sp+len) into dst starting at index 0
+ int dp = 0; // next write index in dst
+ int sl = sp + len; // exclusive end of the input range
+ char repl = replacement().charAt(0); // only the first char of the replacement string is used
+ while (sp < sl) {
+ int b1 = src[sp++] & 0xff;
+ char c = b2cSB[b1]; // try the single-byte table first
+ if (c == UNMAPPABLE_DECODING) {
+ if (sp < sl) {
+ int b2 = src[sp++] & 0xff;
+ if (b2 < b2Min || b2 > b2Max ||
+ (c = b2c[b1][b2 - b2Min]) == UNMAPPABLE_DECODING) { // table lookup only runs when b2 is in range
+ if (b1 == SS2 || b1 == SS3) {
+ sp--; // un-consume b2 so the next iteration decodes it as a first byte
+ }
+ c = repl;
+ }
+ } else {
+ c = repl; // first byte of a pair at the end of the input
+ }
+ }
+ dst[dp++] = c;
+ }
+ return dp; // number of chars written to dst
+ }
}
- public static class Encoder extends CharsetEncoder {
+ public static class Encoder extends CharsetEncoder
+ implements ArrayEncoder
+ {
final int MAX_SINGLEBYTE = 0xff;
private final char[] c2b;
private final char[] c2bIndex;
@@ -516,6 +610,35 @@
return encodeBufferLoop(src, dst);
}
+ public int encode(char[] src, int sp, int len, byte[] dst) { // array fast path: encodes src[sp, sp+len) into dst starting at index 0
+ int dp = 0; // next write index in dst
+ int sl = sp + len; // exclusive end of the input range
+ // dst.length is never consulted: this method does no output bounds checking.
+ while (sp < sl) {
+ char c = src[sp++];
+ int bb = encodeChar(c);
+ if (bb == UNMAPPABLE_ENCODING) {
+ if (Character.isHighSurrogate(c) && sp < sl &&
+ Character.isLowSurrogate(src[sp])) {
+ sp++; // a well-formed surrogate pair yields one replacement, not two
+ }
+ byte[] repl = replacement();
+ dst[dp++] = repl[0];
+ if (repl.length > 1) // at most the first two replacement bytes are written
+ dst[dp++] = repl[1];
+ continue;
+ } //else
+ if (bb > MAX_SINGLEBYTE) { // DoubleByte
+ dst[dp++] = (byte)(bb >> 8);
+ dst[dp++] = (byte)bb;
+ } else { // SingleByte
+ dst[dp++] = (byte)bb;
+ }
+
+ }
+ return dp; // number of bytes written to dst
+ }
+
public int encodeChar(char ch) {
return c2b[c2bIndex[ch >> 8] + (ch & 0xff)];
}
@@ -604,7 +727,6 @@
}
}
- // EBCDIC_DBCS_ONLY
public static class Encoder_EBCDIC_DBCSONLY extends Encoder {
Encoder_EBCDIC_DBCSONLY(Charset cs, byte[] repl,
char[] c2b, char[] c2bIndex) {
@@ -619,7 +741,6 @@
}
}
- // for IBM_EBCDIC_DBCS
public static class Encoder_EBCDIC extends Encoder {
static final int SBCS = 0;
static final int DBCS = 1;
@@ -741,6 +862,47 @@
src.position(mark);
}
}
+
+ public int encode(char[] src, int sp, int len, byte[] dst) { // array fast path that brackets double-byte runs with SO/SI
+ int dp = 0; // next write index in dst
+ int sl = sp + len; // exclusive end of the input range
+ while (sp < sl) {
+ char c = src[sp++];
+ int bb = encodeChar(c);
+
+ if (bb == UNMAPPABLE_ENCODING) {
+ if (Character.isHighSurrogate(c) && sp < sl &&
+ Character.isLowSurrogate(src[sp])) {
+ sp++; // a well-formed surrogate pair yields one replacement, not two
+ }
+ byte[] repl = replacement();
+ dst[dp++] = repl[0]; // replacement bytes are written without changing currentState
+ if (repl.length > 1)
+ dst[dp++] = repl[1];
+ continue;
+ } //else
+ if (bb > MAX_SINGLEBYTE) { // DoubleByte
+ if (currentState == SBCS) { // entering a double-byte run: emit SO first
+ currentState = DBCS;
+ dst[dp++] = SO;
+ }
+ dst[dp++] = (byte)(bb >> 8);
+ dst[dp++] = (byte)bb;
+ } else { // SingleByte
+ if (currentState == DBCS) { // leaving a double-byte run: emit SI first
+ currentState = SBCS;
+ dst[dp++] = SI;
+ }
+ dst[dp++] = (byte)bb;
+ }
+ }
+
+ if (currentState == DBCS) { // close a double-byte run left open at the end of the input
+ currentState = SBCS;
+ dst[dp++] = SI;
+ }
+ return dp; // number of bytes written to dst
+ }
}
// EUC_SIMPLE
--- a/jdk/src/share/classes/sun/nio/cs/ext/HKSCS.java Tue Jul 17 11:01:44 2012 -0700
+++ b/jdk/src/share/classes/sun/nio/cs/ext/HKSCS.java Tue Jul 17 19:57:31 2012 -0700
@@ -175,6 +175,40 @@
}
}
+ public int decode(byte[] src, int sp, int len, char[] dst) { // array fast path: decodes src[sp, sp+len) into dst starting at index 0
+ int dp = 0; // next write index in dst
+ int sl = sp + len; // exclusive end of the input range
+ char repl = replacement().charAt(0); // only the first char of the replacement string is used
+ while (sp < sl) {
+ int b1 = src[sp++] & 0xff;
+ char c = decodeSingle(b1); // try the single-byte mapping first
+ if (c == UNMAPPABLE_DECODING) {
+ if (sl == sp) { // first byte of a pair with no second byte left
+ c = repl;
+ } else {
+ int b2 = src[sp++] & 0xff;
+ if (b2 < b2Min || b2 > b2Max) {
+ c = repl;
+ } else if ((c = decodeDouble(b1, b2)) == UNMAPPABLE_DECODING) { // fall through the lookups in order: decodeDouble, decodeDoubleEx, decodeBig5
+ c = decodeDoubleEx(b1, b2); //supp
+ if (c == UNMAPPABLE_DECODING) {
+ c = decodeBig5(b1, b2); //big5
+ if (c == UNMAPPABLE_DECODING)
+ c = repl;
+ } else {
+ // supplementary character in u+2xxxx area
+ dst[dp++] = Surrogate.high(0x20000 + c);
+ dst[dp++] = Surrogate.low(0x20000 + c);
+ continue; // both surrogates are already written; skip the single-char store below
+ }
+ }
+ }
+ }
+ dst[dp++] = c;
+ }
+ return dp; // number of chars written to dst
+ }
+
public CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) {
if (src.hasArray() && dst.hasArray())
return decodeArrayLoop(src, dst);
@@ -322,6 +356,36 @@
return encodeBufferLoop(src, dst);
}
+ public int encode(char[] src, int sp, int len, byte[] dst) { // array fast path: encodes src[sp, sp+len) into dst starting at index 0
+ int dp = 0; // next write index in dst
+ int sl = sp + len; // exclusive end of the input range
+ while (sp < sl) {
+ char c = src[sp++];
+ int bb = encodeChar(c);
+ if (bb == UNMAPPABLE_ENCODING) {
+ if (!Character.isHighSurrogate(c) || sp == sl ||
+ !Character.isLowSurrogate(src[sp]) ||
+ (bb = encodeSupp(Character.toCodePoint(c, src[sp++])))
+ == UNMAPPABLE_ENCODING) { // only reached for a well-formed pair; src[sp++] consumes its low surrogate
+ byte[] repl = replacement();
+ dst[dp++] = repl[0];
+ if (repl.length > 1) // at most the first two replacement bytes are written
+ dst[dp++] = repl[1];
+ continue;
+ }
+ // No further sp++ here: src[sp++] above already moved past the low surrogate.
+ }
+ if (bb > MAX_SINGLEBYTE) { // DoubleByte
+ dst[dp++] = (byte)(bb >> 8);
+ dst[dp++] = (byte)bb;
+ } else { // SingleByte
+ dst[dp++] = (byte)bb;
+ }
+ }
+ return dp; // number of bytes written to dst
+ }
+
+
static char[] C2B_UNMAPPABLE = new char[0x100];
static {
Arrays.fill(C2B_UNMAPPABLE, (char)UNMAPPABLE_ENCODING);
--- a/jdk/test/sun/nio/cs/StrCodingBenchmark.java Tue Jul 17 11:01:44 2012 -0700
+++ b/jdk/test/sun/nio/cs/StrCodingBenchmark.java Tue Jul 17 19:57:31 2012 -0700
@@ -75,7 +75,7 @@
return nanoss;
}
- public static void time(Job ... jobs) throws Throwable {
+ public static long[] time(Job ... jobs) throws Throwable {
long[] warmup = time0(jobs); // Warm up run
long[] nanoss = time0(jobs); // Real timing run
@@ -110,6 +110,7 @@
// Print out absolute and relative times, calibrated against first job
for (int i = 0; i < jobs.length; i++)
System.out.printf(format, jobs[i].name(), milliss[i], ratios[i]);
+ return milliss;
}
public static Job[] filter(Pattern filter, Job[] jobs) {
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/test/sun/nio/cs/StrCodingBenchmarkDB.java Tue Jul 17 19:57:31 2012 -0700
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import java.util.*;
+import java.nio.*;
+import java.nio.charset.*;
+import java.util.concurrent.*;
+import java.util.regex.Pattern;
+
+public class StrCodingBenchmarkDB extends StrCodingBenchmark {
+ // Times new String(byte[], ...) and String.getBytes(...) for a list of charsets,
+ // by charset name and by Charset object, at string lengths from 8 to 2048.
+ public static void main(String[] args) throws Throwable {
+ final int itrs = Integer.getInteger("iterations", 100000); // -Diterations=N sets the loop count per job
+ //final int itrs = Integer.getInteger("iterations", 12);
+ final int size = Integer.getInteger("size", 2048);
+ final int subsize = Integer.getInteger("subsize", 128);
+ final int maxchar = Integer.getInteger("maxchar", 128);
+ final String regex = System.getProperty("filter");
+ final Pattern filter = (regex == null) ? null : Pattern.compile(regex);
+ final boolean useSecurityManager = Boolean.getBoolean("SecurityManager");
+ if (useSecurityManager)
+ System.setSecurityManager(new PermissiveSecurityManger());
+ final Random rnd = new Random();
+
+ String[] csns = new String[] {
+ "Big5",
+ "Johab",
+ "EUC_CN",
+ "EUC_KR",
+ "MS932",
+ "MS936",
+ "MS949",
+ "MS950",
+ "GBK",
+
+ "Big5_HKSCS",
+ "Big5_HKSCS_2001",
+ "Big5_Solaris",
+ "MS950_HKSCS",
+ "MS950_HKSCS_XP",
+ "IBM1364",
+ "IBM1381",
+ "IBM1383",
+ "IBM930",
+ "IBM933",
+ "IBM935",
+ "IBM937",
+ "IBM939",
+ "IBM942",
+ "IBM943",
+ "IBM948",
+ "IBM949",
+ "IBM950",
+ "IBM970",
+ };
+
+ ArrayList<long[]> sum = new ArrayList<>(); // one long[] of timings per (charset, length) run
+
+ for (final String csn : csns) {
+ final Charset cs = Charset.forName(csn);
+ List<Integer> cps = new ArrayList<>(0x4000);
+ // Collect up to 0x4000 chars (scanning from U+0000) that this charset can encode.
+ int cp = 0;
+ int n = 0;
+ CharsetEncoder enc = cs.newEncoder();
+ while (cp < 0x10000 && n < 0x4000) { // was "n < cps.size()", which is 0 < 0 on entry and never collected anything
+ if (enc.canEncode((char)cp)) {
+ cps.add(cp);
+ n++;
+ }
+ cp++;
+ }
+ Collections.shuffle(cps); // mix single- and double-byte chars
+ char[] ca = new char[cps.size()];
+ for (int i = 0; i < cps.size(); i++)
+ ca[i] = (char)(int)cps.get(i);
+
+
+ System.out.printf("%n--------%s---------%n", csn);
+ for (int sz = 8; sz <= 2048; sz *= 2) {
+ System.out.printf(" [len=%d]%n", sz);
+
+ final char[] chars = Arrays.copyOf(ca, sz); // pads with '\0' if fewer than sz chars were collected
+ final String str = new String(chars);
+ final byte[] bs = str.getBytes(cs);
+
+ Job[] jobs = {
+
+ new Job("String decode: csn") {
+ public void work() throws Throwable {
+ for (int i = 0; i < itrs; i++)
+ new String(bs, csn);
+ }},
+
+ new Job("String decode: cs") {
+ public void work() throws Throwable {
+ for (int i = 0; i < itrs; i++)
+ new String(bs, cs);
+ }},
+
+ new Job("String encode: csn") {
+ public void work() throws Throwable {
+ for (int i = 0; i < itrs; i++)
+ str.getBytes(csn);
+ }},
+
+ new Job("String encode: cs") {
+ public void work() throws Throwable {
+ for (int i = 0; i < itrs; i++)
+ str.getBytes(cs);
+ }},
+ };
+ sum.add(time(jobs));
+
+ }
+ }
+ }
+}
--- a/jdk/test/sun/nio/cs/TestStringCoding.java Tue Jul 17 11:01:44 2012 -0700
+++ b/jdk/test/sun/nio/cs/TestStringCoding.java Tue Jul 17 19:57:31 2012 -0700
@@ -24,7 +24,7 @@
*/
/* @test
- @bug 6636323 6636319 7040220 7096080
+ @bug 6636323 6636319 7040220 7096080 7183053
@summary Test if StringCoding and NIO result have the same de/encoding result
* @run main/othervm/timeout=2000 TestStringCoding
*/
@@ -70,11 +70,62 @@
}
test(cs, Arrays.copyOf(bmpCA, clen), Arrays.copyOf(sbBA, blen));
}
+
+ testMixed(cs);
System.out.println("done!");
}
}
}
+ static void testMixed(Charset cs) throws Throwable { // checks String coding against the NIO coders on a shuffled mix of every char the charset can encode
+ CharsetDecoder dec = cs.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPLACE)
+ .onUnmappableCharacter(CodingErrorAction.REPLACE);
+ CharsetEncoder enc = cs.newEncoder()
+ .onMalformedInput(CodingErrorAction.REPLACE)
+ .onUnmappableCharacter(CodingErrorAction.REPLACE);
+ List<Integer> cps = new ArrayList<>(0x10000);
+ int off = 0; // not used in this method
+ int cp = 0;
+ while (cp < 0x10000) { // collect every char value the encoder accepts
+ if (enc.canEncode((char)cp)) {
+ cps.add(cp);
+ }
+ cp++;
+ }
+ Collections.shuffle(cps); // randomize the order of the collected chars
+ char[] bmpCA = new char[cps.size()];
+ for (int i = 0; i < cps.size(); i++)
+ bmpCA[i] = (char)(int)cps.get(i);
+ String bmpStr = new String(bmpCA);
+ //getBytes(csn);
+ byte[] bmpBA = bmpStr.getBytes(cs.name());
+ ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA)); // NIO result used as the reference for both getBytes variants
+ byte[] baNIO = new byte[bf.limit()];
+ bf.get(baNIO, 0, baNIO.length);
+ if (!Arrays.equals(bmpBA, baNIO)) {
+ throw new RuntimeException("getBytes(csn) failed -> " + cs.name());
+ }
+
+ //getBytes(cs);
+ bmpBA = bmpStr.getBytes(cs);
+ if (!Arrays.equals(bmpBA, baNIO))
+ throw new RuntimeException("getBytes(cs) failed -> " + cs.name());
+
+ //new String(csn);
+ String strSC = new String(bmpBA, cs.name());
+ String strNIO = dec.reset().decode(ByteBuffer.wrap(bmpBA)).toString(); // NIO result used as the reference for both String constructors
+ if(!strNIO.equals(strSC)) {
+ throw new RuntimeException("new String(csn) failed -> " + cs.name());
+ }
+
+ //new String(cs);
+ strSC = new String(bmpBA, cs);
+ if (!strNIO.equals(strSC))
+ throw new RuntimeException("new String(cs) failed -> " + cs.name());
+
+ }
+
static void test(Charset cs, char[] bmpCA, byte[] sbBA) throws Throwable {
String bmpStr = new String(bmpCA);
CharsetDecoder dec = cs.newDecoder()
@@ -100,6 +151,7 @@
//new String(csn);
String strSC = new String(sbBA, cs.name());
String strNIO = dec.reset().decode(ByteBuffer.wrap(sbBA)).toString();
+
if(!strNIO.equals(strSC))
throw new RuntimeException("new String(csn) failed -> " + cs.name());
@@ -112,7 +164,7 @@
if (enc instanceof sun.nio.cs.ArrayEncoder &&
cs.contains(Charset.forName("ASCII"))) {
if (cs.name().equals("UTF-8") || // utf8 handles surrogates
- cs.name().equals("CESU-8")) // utf8 handles surrogates
+ cs.name().equals("CESU-8")) // utf8 handles surrogates
return;
enc.replaceWith(new byte[] { (byte)'A'});
sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder)enc;
@@ -137,12 +189,16 @@
cs.name())))
throw new RuntimeException("encode3(surrogates) failed -> "
+ cs.name());
+ /* sun.nio.cs.ArrayDecoder/ArrayEncoder work on the assumption that the
+ invoker (StringCoding) allocates a large enough output buffer; the utf8
+ and double-byte coders do not check the output buffer limit.
ba = new byte[str.length() - 1];
n = cae.encode(str.toCharArray(), 0, str.length(), ba);
- if (n != 7 || !"abABABc".equals(new String(ba, 0, n,
- cs.name())))
+ if (n != 7 || !"abABABc".equals(new String(ba, 0, n, cs.name()))) {
throw new RuntimeException("encode4(surrogates) failed -> "
+ cs.name());
+ }
+ */
}
}