8039751: UTF-8 decoder fails to handle some edge cases correctly
author sherman
Sat, 12 Apr 2014 14:38:50 -0700
changeset 23880 7d6b060131d3
parent 23879 284802a2d355
child 23881 f4fc1826e04a
8039751: UTF-8 decoder fails to handle some edge cases correctly Summary: to update decoder.isMalformed4_2() to correctly detect out of range 2nd byte Reviewed-by: alanb
jdk/src/share/classes/sun/nio/cs/UTF_8.java
jdk/test/sun/nio/cs/TestUTF8.java
--- a/jdk/src/share/classes/sun/nio/cs/UTF_8.java	Sat Apr 12 20:21:09 2014 +0100
+++ b/jdk/src/share/classes/sun/nio/cs/UTF_8.java	Sat Apr 12 14:38:50 2014 -0700
@@ -111,12 +111,18 @@
                    (b4 & 0xc0) != 0x80;
         }
 
-        // only used when there is less than 4 bytes left in src buffer
+        // only used when there is less than 4 bytes left in src buffer.
+        // both b1 and b2 should be "& 0xff" before passed in.
         private static boolean isMalformed4_2(int b1, int b2) {
-            return (b1 == 0xf0 && b2 == 0x90) ||
+            return (b1 == 0xf0 && (b2  < 0x90 || b2 > 0xbf)) ||
+                   (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
                    (b2 & 0xc0) != 0x80;
         }
 
+        // tests if b1 and b2 are malformed as the first 2 bytes of a
+        // legal 4-byte utf-8 byte sequence.
+        // only used when there is less than 4 bytes left in src buffer,
+        // after isMalformed4_2 has been invoked.
         private static boolean isMalformed4_3(int b3) {
             return (b3 & 0xc0) != 0x80;
         }
@@ -280,7 +286,9 @@
                     // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                     int srcRemaining = sl - sp;
                     if (srcRemaining < 4 || dl - dp < 2) {
-                        if (srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1]))
+                        b1 &= 0xff;
+                        if (b1 > 0xf4 ||
+                            srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1] & 0xff))
                             return malformedForLength(src, sp, dst, dp, 1);
                         if (srcRemaining > 2 && isMalformed4_3(sa[sp + 2]))
                             return malformedForLength(src, sp, dst, dp, 2);
@@ -363,7 +371,9 @@
                     // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                     int srcRemaining = limit - mark;
                     if (srcRemaining < 4 || dst.remaining() < 2) {
-                        if (srcRemaining > 1 && isMalformed4_2(b1, src.get()))
+                        b1 &= 0xff;
+                        if (b1 > 0xf4 ||
+                            srcRemaining > 1 && isMalformed4_2(b1, src.get() & 0xff))
                             return malformedForLength(src, mark, 1);
                         if (srcRemaining > 2 && isMalformed4_3(src.get()))
                             return malformedForLength(src, mark, 2);
@@ -518,8 +528,9 @@
                     }
                     if (malformedInputAction() != CodingErrorAction.REPLACE)
                         return -1;
-
-                    if (sp  < sl && isMalformed4_2(b1, sa[sp])) {
+                    b1 &= 0xff;
+                    if (b1 > 0xf4 ||
+                        sp  < sl && isMalformed4_2(b1, sa[sp] & 0xff)) {
                         da[dp++] = replacement().charAt(0);
                         continue;
                     }
--- a/jdk/test/sun/nio/cs/TestUTF8.java	Sat Apr 12 20:21:09 2014 +0100
+++ b/jdk/test/sun/nio/cs/TestUTF8.java	Sat Apr 12 14:38:50 2014 -0700
@@ -23,7 +23,7 @@
 
 /*
  * @test
- * @bug 4486841 7040220 7096080
+ * @bug 4486841 7040220 7096080 8039751
  * @summary Test UTF-8 charset
  */
 
@@ -291,14 +291,18 @@
         {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
         {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
         {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
+        {2, (byte)0xE1, (byte)0x80, (byte)0x42},  // invalid third byte
+
         {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
         {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
         {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte
         {1, (byte)0xE0, (byte)0x41,},             // invalid second byte & 2 bytes
+        {1, (byte)0xE1, (byte)0x40,},             // invalid second byte & 2 bytes
         {3, (byte)0xED, (byte)0xAE, (byte)0x80 }, // 3 bytes surrogate
         {3, (byte)0xED, (byte)0xB0, (byte)0x80 }, // 3 bytes surrogate
 
 
+
         // Four-byte sequences
         {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
         {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
@@ -323,6 +327,32 @@
         {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
         {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 
+        // #8039751
+        {1, (byte)0xF6, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte
+        {1, (byte)0xF6, (byte)0x80, (byte)0x80,  },
+        {1, (byte)0xF6, (byte)0x80, },
+        {1, (byte)0xF6, },
+        {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte
+        {1, (byte)0xF5, (byte)0x80, (byte)0x80,  },
+        {1, (byte)0xF5, (byte)0x80,  },
+        {1, (byte)0xF5  },
+
+        {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
+        {1, (byte)0xF4, (byte)0x90, (byte)0x80 },
+        {1, (byte)0xF4, (byte)0x90 },
+
+        {1, (byte)0xF4, (byte)0x7f, (byte)0x80, (byte)0x80 }, // out-range/ascii 2nd byte
+        {1, (byte)0xF4, (byte)0x7f, (byte)0x80 },
+        {1, (byte)0xF4, (byte)0x7f },
+
+        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
+        {1, (byte)0xF0, (byte)0x80, (byte)0x80 },
+        {1, (byte)0xF0, (byte)0x80 },
+
+        {1, (byte)0xF0, (byte)0xc0, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
+        {1, (byte)0xF0, (byte)0xc0, (byte)0x80 },
+        {1, (byte)0xF0, (byte)0xc0 },
+
         // Five-byte sequences
         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid first byte
         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
@@ -553,7 +583,6 @@
         check4ByteSurrs("UTF-8");
         checkMalformed("UTF-8", malformed);
         checkUnderOverflow("UTF-8");
-
         checkRoundtrip("CESU-8");
         check6ByteSurrs("CESU-8");
         checkMalformed("CESU-8", malformed_cesu8);