8222955: Optimize String.replace(CharSequence, CharSequence) for common cases
Reviewed-by: redestad, tvaleev
--- a/src/java.base/share/classes/java/lang/String.java Mon May 06 18:01:01 2019 -0400
+++ b/src/java.base/share/classes/java/lang/String.java Mon May 06 18:07:55 2019 -0700
@@ -75,7 +75,7 @@
* System.out.println("abc");
* String cde = "cde";
* System.out.println("abc" + cde);
- * String c = "abc".substring(2,3);
+ * String c = "abc".substring(2, 3);
* String d = cde.substring(1, 2);
* </pre></blockquote>
* <p>
@@ -2160,27 +2160,48 @@
* @since 1.5
*/
public String replace(CharSequence target, CharSequence replacement) {
- String tgtStr = target.toString();
+ String trgtStr = target.toString();
String replStr = replacement.toString();
- int j = indexOf(tgtStr);
- if (j < 0) {
- return this;
- }
- int tgtLen = tgtStr.length();
- int tgtLen1 = Math.max(tgtLen, 1);
int thisLen = length();
+ int trgtLen = trgtStr.length();
+ int replLen = replStr.length();
+
+ if (trgtLen > 0) {
+ if (trgtLen == 1 && replLen == 1) {
+ return replace(trgtStr.charAt(0), replStr.charAt(0));
+ }
- int newLenHint = thisLen - tgtLen + replStr.length();
- if (newLenHint < 0) {
- throw new OutOfMemoryError();
+ boolean thisIsLatin1 = this.isLatin1();
+ boolean trgtIsLatin1 = trgtStr.isLatin1();
+ boolean replIsLatin1 = replStr.isLatin1();
+ String ret = (thisIsLatin1 && trgtIsLatin1 && replIsLatin1)
+ ? StringLatin1.replace(value, thisLen,
+ trgtStr.value, trgtLen,
+ replStr.value, replLen)
+ : StringUTF16.replace(value, thisLen, thisIsLatin1,
+ trgtStr.value, trgtLen, trgtIsLatin1,
+ replStr.value, replLen, replIsLatin1);
+ if (ret != null) {
+ return ret;
+ }
+ return this;
+
+ } else { // trgtLen == 0
+ int resultLen;
+ try {
+ resultLen = Math.addExact(thisLen, Math.multiplyExact(
+ Math.addExact(thisLen, 1), replLen));
+ } catch (ArithmeticException ignored) {
+ throw new OutOfMemoryError();
+ }
+
+ StringBuilder sb = new StringBuilder(resultLen);
+ sb.append(replStr);
+ for (int i = 0; i < thisLen; ++i) {
+ sb.append(charAt(i)).append(replStr);
+ }
+ return sb.toString();
}
- StringBuilder sb = new StringBuilder(newLenHint);
- int i = 0;
- do {
- sb.append(this, i, j).append(replStr);
- i = j + tgtLen;
- } while (j < thisLen && (j = indexOf(tgtStr, j + tgtLen1)) > 0);
- return sb.append(this, i, thisLen).toString();
}
/**
--- a/src/java.base/share/classes/java/lang/StringLatin1.java Mon May 06 18:01:01 2019 -0400
+++ b/src/java.base/share/classes/java/lang/StringLatin1.java Mon May 06 18:07:55 2019 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -42,6 +42,14 @@
final class StringLatin1 {
+ /**
+ * The maximum size of array to allocate (unless necessary).
+ * Some VMs reserve some header words in an array.
+ * Attempts to allocate larger arrays may result in
+ * OutOfMemoryError: Requested array size exceeds VM limit
+ */
+ private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
+
public static char charAt(byte[] value, int index) {
if (index < 0 || index >= value.length) {
throw new StringIndexOutOfBoundsException(index);
@@ -304,7 +312,7 @@
}
if (i < len) {
if (canEncode(newChar)) {
- byte buf[] = new byte[len];
+ byte[] buf = StringConcatHelper.newArray(len);
for (int j = 0; j < i; j++) { // TBD arraycopy?
buf[j] = value[j];
}
@@ -330,6 +338,64 @@
return null; // for string to return this;
}
+ public static String replace(byte[] value, int valLen, byte[] targ,
+ int targLen, byte[] repl, int replLen)
+ {
+ assert targLen > 0;
+ int i, j, p = 0;
+ if (valLen == 0 || (i = indexOf(value, valLen, targ, targLen, 0)) < 0) {
+ return null; // for string to return this;
+ }
+
+ // find and store indices of substrings to replace
+ int[] pos = new int[16];
+ pos[0] = i;
+ i += targLen;
+ while ((j = indexOf(value, valLen, targ, targLen, i)) > 0) {
+ if (++p == pos.length) {
+ int cap = p + (p >> 1);
+ // overflow-conscious code
+ if (cap - MAX_ARRAY_SIZE > 0) {
+ if (p == MAX_ARRAY_SIZE) {
+ throw new OutOfMemoryError();
+ }
+ cap = MAX_ARRAY_SIZE;
+ }
+ pos = Arrays.copyOf(pos, cap);
+ }
+ pos[p] = j;
+ i = j + targLen;
+ }
+
+ int resultLen;
+ try {
+ resultLen = Math.addExact(valLen,
+ Math.multiplyExact(++p, replLen - targLen));
+ } catch (ArithmeticException ignored) {
+ throw new OutOfMemoryError();
+ }
+ if (resultLen == 0) {
+ return "";
+ }
+
+ byte[] result = StringConcatHelper.newArray(resultLen);
+ int posFrom = 0, posTo = 0;
+ for (int q = 0; q < p; ++q) {
+ int nextPos = pos[q];
+ while (posFrom < nextPos) {
+ result[posTo++] = value[posFrom++];
+ }
+ posFrom += targLen;
+ for (int k = 0; k < replLen; ++k) {
+ result[posTo++] = repl[k];
+ }
+ }
+ while (posFrom < valLen) {
+ result[posTo++] = value[posFrom++];
+ }
+ return new String(result, LATIN1);
+ }
+
// case insensitive
public static boolean regionMatchesCI(byte[] value, int toffset,
byte[] other, int ooffset, int len) {
--- a/src/java.base/share/classes/java/lang/StringUTF16.java Mon May 06 18:01:01 2019 -0400
+++ b/src/java.base/share/classes/java/lang/StringUTF16.java Mon May 06 18:07:55 2019 -0700
@@ -574,7 +574,7 @@
}
}
if (i < len) {
- byte buf[] = new byte[value.length];
+ byte[] buf = new byte[value.length];
for (int j = 0; j < i; j++) {
putChar(buf, j, getChar(value, j)); // TBD:arraycopy?
}
@@ -582,21 +582,145 @@
char c = getChar(value, i);
putChar(buf, i, c == oldChar ? newChar : c);
i++;
- }
- // Check if we should try to compress to latin1
- if (String.COMPACT_STRINGS &&
- !StringLatin1.canEncode(oldChar) &&
- StringLatin1.canEncode(newChar)) {
- byte[] val = compress(buf, 0, len);
- if (val != null) {
- return new String(val, LATIN1);
- }
- }
- return new String(buf, UTF16);
+ }
+ // Check if we should try to compress to latin1
+ if (String.COMPACT_STRINGS &&
+ !StringLatin1.canEncode(oldChar) &&
+ StringLatin1.canEncode(newChar)) {
+ byte[] val = compress(buf, 0, len);
+ if (val != null) {
+ return new String(val, LATIN1);
+ }
+ }
+ return new String(buf, UTF16);
}
return null;
}
+ public static String replace(byte[] value, int valLen, boolean valLat1,
+ byte[] targ, int targLen, boolean targLat1,
+ byte[] repl, int replLen, boolean replLat1)
+ {
+ assert targLen > 0;
+ assert !valLat1 || !targLat1 || !replLat1;
+
+ // Possible combinations of the arguments/result encodings:
+ // +---+--------+--------+--------+-----------------------+
+ // | # | VALUE | TARGET | REPL | RESULT |
+ // +===+========+========+========+=======================+
+ // | 1 | Latin1 | Latin1 | UTF16 | null or UTF16 |
+ // +---+--------+--------+--------+-----------------------+
+ // | 2 | Latin1 | UTF16 | Latin1 | null |
+ // +---+--------+--------+--------+-----------------------+
+ // | 3 | Latin1 | UTF16 | UTF16 | null |
+ // +---+--------+--------+--------+-----------------------+
+ // | 4 | UTF16 | Latin1 | Latin1 | null or UTF16 |
+ // +---+--------+--------+--------+-----------------------+
+ // | 5 | UTF16 | Latin1 | UTF16 | null or UTF16 |
+ // +---+--------+--------+--------+-----------------------+
+ // | 6 | UTF16 | UTF16 | Latin1 | null, Latin1 or UTF16 |
+ // +---+--------+--------+--------+-----------------------+
+ // | 7 | UTF16 | UTF16 | UTF16 | null or UTF16 |
+ // +---+--------+--------+--------+-----------------------+
+
+ if (String.COMPACT_STRINGS && valLat1 && !targLat1) {
+ // combinations 2 or 3
+ return null; // for string to return this;
+ }
+
+ int i = (String.COMPACT_STRINGS && valLat1)
+ ? StringLatin1.indexOf(value, targ) :
+ (String.COMPACT_STRINGS && targLat1)
+ ? indexOfLatin1(value, targ)
+ : indexOf(value, targ);
+ if (i < 0) {
+ return null; // for string to return this;
+ }
+
+ // find and store indices of substrings to replace
+ int j, p = 0;
+ int[] pos = new int[16];
+ pos[0] = i;
+ i += targLen;
+ while ((j = ((String.COMPACT_STRINGS && valLat1)
+ ? StringLatin1.indexOf(value, valLen, targ, targLen, i) :
+ (String.COMPACT_STRINGS && targLat1)
+ ? indexOfLatin1(value, valLen, targ, targLen, i)
+ : indexOf(value, valLen, targ, targLen, i))) > 0)
+ {
+ if (++p == pos.length) {
+ int cap = p + (p >> 1);
+ // overflow-conscious code
+ if (cap - MAX_ARRAY_SIZE > 0) {
+ if (p == MAX_ARRAY_SIZE) {
+ throw new OutOfMemoryError();
+ }
+ cap = MAX_ARRAY_SIZE;
+ }
+ pos = Arrays.copyOf(pos, cap);
+ }
+ pos[p] = j;
+ i = j + targLen;
+ }
+
+ int resultLen;
+ try {
+ resultLen = Math.addExact(valLen,
+ Math.multiplyExact(++p, replLen - targLen));
+ } catch (ArithmeticException ignored) {
+ throw new OutOfMemoryError();
+ }
+ if (resultLen == 0) {
+ return "";
+ }
+
+ byte[] result = newBytesFor(resultLen);
+ int posFrom = 0, posTo = 0;
+ for (int q = 0; q < p; ++q) {
+ int nextPos = pos[q];
+ if (String.COMPACT_STRINGS && valLat1) {
+ while (posFrom < nextPos) {
+ char c = (char)(value[posFrom++] & 0xff);
+ putChar(result, posTo++, c);
+ }
+ } else {
+ while (posFrom < nextPos) {
+ putChar(result, posTo++, getChar(value, posFrom++));
+ }
+ }
+ posFrom += targLen;
+ if (String.COMPACT_STRINGS && replLat1) {
+ for (int k = 0; k < replLen; ++k) {
+ char c = (char)(repl[k] & 0xff);
+ putChar(result, posTo++, c);
+ }
+ } else {
+ for (int k = 0; k < replLen; ++k) {
+ putChar(result, posTo++, getChar(repl, k));
+ }
+ }
+ }
+ if (String.COMPACT_STRINGS && valLat1) {
+ while (posFrom < valLen) {
+ char c = (char)(value[posFrom++] & 0xff);
+ putChar(result, posTo++, c);
+ }
+ } else {
+ while (posFrom < valLen) {
+ putChar(result, posTo++, getChar(value, posFrom++));
+ }
+ }
+
+ if (String.COMPACT_STRINGS && replLat1 && !targLat1) {
+ // combination 6
+ byte[] lat1Result = compress(result, 0, resultLen);
+ if (lat1Result != null) {
+ return new String(lat1Result, LATIN1);
+ }
+ }
+ return new String(result, UTF16);
+ }
+
public static boolean regionMatchesCI(byte[] value, int toffset,
byte[] other, int ooffset, int len) {
int last = toffset + len;
@@ -1430,6 +1554,15 @@
static final int MAX_LENGTH = Integer.MAX_VALUE >> 1;
+
+ /**
+ * The maximum size of array to allocate (unless necessary).
+ * Some VMs reserve some header words in an array.
+ * Attempts to allocate larger arrays may result in
+ * OutOfMemoryError: Requested array size exceeds VM limit
+ */
+ private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
+
// Used by trusted callers. Assumes all necessary bounds checks have
// been done by the caller.
--- a/test/jdk/java/lang/String/LiteralReplace.java Mon May 06 18:01:01 2019 -0400
+++ b/test/jdk/java/lang/String/LiteralReplace.java Mon May 06 18:07:55 2019 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,7 +22,7 @@
*/
/* @test
- * @bug 8058779 8054307
+ * @bug 8058779 8054307 8222955
* @library /test/lib
* @build jdk.test.lib.RandomFactory
* @run testng LiteralReplace
@@ -79,6 +79,10 @@
source.replace(target, replacement);
}
+ @Test(expectedExceptions = {OutOfMemoryError.class})
+ public void testOOM() {
+ "1".repeat(65537).replace("1", "2".repeat(65537));
+ }
@DataProvider
public static Object[][] sourceTargetReplacementExpected() {
@@ -104,6 +108,14 @@
{"abcdefgh", "[a-h]", "X", "abcdefgh"},
{"aa+", "a+", "", "a"},
{"^abc$", "abc", "x", "^x$"},
+ {"abc", "b", "_", "a_c"},
+ {"abc", "bc", "_", "a_"},
+ {"abc".repeat(65537) + "end", "b", "_XYZ_", "a_XYZ_c".repeat(65537) + "end"},
+ {"abc".repeat(65537) + "end", "a", "_", "_bc".repeat(65537) + "end"},
+ {"a".repeat(65537), "a", "", ""},
+ {"ab".repeat(65537), "a", "", "b".repeat(65537)},
+ {"ab".repeat(65537), "ab", "", ""},
+ {"b" + "ab".repeat(65537), "ab", "", "b"},
// more with non-latin1 characters
{"\u4e00\u4e00\u4e00",
@@ -207,6 +219,42 @@
"\u4e00\u4e01\u4e02\u4e03\u4e04\u4e05",
"",
""},
+
+ {"\u4e01\u4e02\u4e03", "\u4e02", "\u4e02", "\u4e01\u4e02\u4e03"},
+ {"\u4e01\u4e02\u4e03", "\u4e02", "\u4e04", "\u4e01\u4e04\u4e03"},
+ {"\u4e01\u4e02\u4e03", "\u4e02", "_", "\u4e01_\u4e03"},
+ {"a\u4e02c", "\u4e02", "_", "a_c"},
+ {"\u4e01@\u4e03", "@", "_", "\u4e01_\u4e03"},
+ {"\u4e01@\u4e03", "@", "\u4e02", "\u4e01\u4e02\u4e03"},
+ {"\u4e01\u4e02\u4e03", "\u4e02\u4e03", "\u4e02\u4e03", "\u4e01\u4e02\u4e03"},
+ {"\u4e01\u4e02\u4e03", "\u4e02\u4e03", "\u4e04\u4e05", "\u4e01\u4e04\u4e05"},
+ {"\u4e01\u4e02\u4e03", "\u4e02\u4e03", "\u4e06", "\u4e01\u4e06"},
+ {"\u4e01\u4e02\u4e03", "\u4e02\u4e03", "<>", "\u4e01<>"},
+ {"@\u4e02\u4e03", "\u4e02\u4e03", "<>", "@<>"},
+ {"\u4e01@@", "\u4e01@", "", "@"},
+ {"\u4e01@@", "\u4e01@", "#", "#@"},
+ {"\u4e01@@", "\u4e01@", "\u4e09", "\u4e09@"},
+ {"\u4e01@@", "\u4e01@", "#\u4e09", "#\u4e09@"},
+ {"\u4e01\u4e02\u4e03".repeat(32771) + "end", "\u4e02", "\u4e02", "\u4e01\u4e02\u4e03".repeat(32771) + "end"},
+ {"\u4e01\u4e02\u4e03".repeat(32771) + "end", "\u4e02", "\u4e04", "\u4e01\u4e04\u4e03".repeat(32771) + "end"},
+ {"\u4e01\u4e02\u4e03".repeat(32771) + "end", "\u4e02", "\u4e04\u4e05", "\u4e01\u4e04\u4e05\u4e03".repeat(32771) + "end"},
+ {"\u4e01\u4e02\u4e03".repeat(32771) + "end", "\u4e02", "_", "\u4e01_\u4e03".repeat(32771) + "end"},
+ {"\u4e01_\u4e03".repeat(32771) + "end", "_", "_", "\u4e01_\u4e03".repeat(32771) + "end"},
+ {"\u4e01_\u4e03".repeat(32771) + "end", "_", "\u4e06", "\u4e01\u4e06\u4e03".repeat(32771) + "end"},
+ {"\u4e01_\u4e03".repeat(32771) + "end", "_", "\u4e06\u4e06", "\u4e01\u4e06\u4e06\u4e03".repeat(32771) + "end"},
+ {"X_Y".repeat(32771) + "end", "_", "\u4e07", "X\u4e07Y".repeat(32771) + "end"},
+ {"X_Y".repeat(32771) + "end", "_", "\u4e07\u4e08", "X\u4e07\u4e08Y".repeat(32771) + "end"},
+ {"X_Y".repeat(32771) + "end", "_", ".\u4e08.", "X.\u4e08.Y".repeat(32771) + "end"},
+ {"\u4e0a".repeat(32771), "\u4e0a", "", ""},
+ {"\u4e0a\u4e0b".repeat(32771), "\u4e0a", "", "\u4e0b".repeat(32771)},
+ {"\u4e0a\u4e0b".repeat(32771), "\u4e0b", "", "\u4e0a".repeat(32771)},
+ {"\u4e0a\u4e0b".repeat(32771), "\u4e0a\u4e0b", "", ""},
+ {"\u4e0b" + "\u4e0a\u4e0b".repeat(32771), "\u4e0a\u4e0b", "", "\u4e0b"},
+ {"\u4e0a\u4e0b".repeat(32771) + "\u4e0a", "\u4e0a\u4e0b", "", "\u4e0a"},
+ {"\u4e0b" + "\u4e0a\u4e0b".repeat(32771) + "\u4e0a", "\u4e0a\u4e0b", "", "\u4e0b\u4e0a"},
+ {"b" + "\u4e0a\u4e0b".repeat(32771), "\u4e0a\u4e0b", "", "b"},
+ {"\u4e0a\u4e0b".repeat(32771) + "a", "\u4e0a\u4e0b", "", "a"},
+ {"b" + "\u4e0a\u4e0b".repeat(32771) + "a", "\u4e0a\u4e0b", "", "ba"},
};
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/micro/org/openjdk/bench/java/lang/StringReplace.java Mon May 06 18:07:55 2019 -0700
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package org.openjdk.bench.java.lang;
+
+import org.openjdk.jmh.annotations.*;
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Benchmark)
+public class StringReplace {
+
+ public String slat1 = new String("java.lang.String");
+ public String sutf16 = new String("\u1D36\u1D43\u1D5B\u1D43 \uFF11\uFF13");
+
+ @Benchmark
+ public String replace0x1_1_Latin1() {
+ return slat1.replace("Z", "z");
+ }
+
+ @Benchmark
+ public String replace1x1_0_Latin1() {
+ return slat1.replace("g", "");
+ }
+
+ @Benchmark
+ public String replace1x1_1_Latin1() {
+ return slat1.replace("g", "k");
+ }
+
+ @Benchmark
+ public String replace1x1_2_Latin1() {
+ return slat1.replace("g", "th");
+ }
+
+ @Benchmark
+ public String replace2x1_0_Latin1() {
+ return slat1.replace(".", "");
+ }
+
+ @Benchmark
+ public String replace2x1_1_Latin1() {
+ return slat1.replace(".", "/");
+ }
+
+ @Benchmark
+ public String replace2x1_2_Latin1() {
+ return slat1.replace(".", "::");
+ }
+
+ @Benchmark
+ public String replace0x1_1_UTF16() {
+ return sutf16.replace("\u1D36", "\u1D38\u1D3A");
+ }
+
+ @Benchmark
+ public String replace1x1_0_UTF16() {
+ return sutf16.replace("\uFF11", "");
+ }
+
+ @Benchmark
+ public String replace1x1_1_UTF16() {
+ return sutf16.replace("\u1D36", "\u24BF");
+ }
+
+ @Benchmark
+ public String replace1x1_2_UTF16() {
+ return sutf16.replace("\uFF11", "\uFF11\uFF12");
+ }
+
+ @Benchmark
+ public String replace2x1_0_UTF16() {
+ return sutf16.replace("\u1D43", "");
+ }
+
+ @Benchmark
+ public String replace2x1_1_UTF16() {
+ return sutf16.replace("\u1D43", "\u1D2C");
+ }
+
+ @Benchmark
+ public String replace2x1_2_UTF16() {
+ return sutf16.replace("\u1D43", "\u21DB\u21DB");
+ }
+}