hotspot/src/share/vm/utilities/utf8.cpp
changeset 36508 5f9eee6b383b
parent 33628 09241459a8b8
child 40901 0c83ed47db08
--- a/hotspot/src/share/vm/utilities/utf8.cpp	Tue Mar 15 13:48:21 2016 -0700
+++ b/hotspot/src/share/vm/utilities/utf8.cpp	Thu Mar 17 19:04:01 2016 +0000
@@ -333,6 +333,68 @@
                  + ((str[4] & 0x0f) << 6)  + (str[5] & 0x3f);
 }
 
+bool UTF8::is_legal_utf8(const unsigned char* buffer, int length,  // returns true iff every byte sequence in buffer[0..length) passes the checks below
+                         bool version_leq_47) {  // when true, the decoded-value range checks are skipped (presumably class file version <= 47 - confirm against callers)
+  int i = 0;  // index of the next byte to examine
+  int count = length >> 2;  // number of complete 4-byte groups in the buffer
+  for (int k=0; k<count; k++) {  // fast path: skip leading groups whose four bytes are all in 1..127
+    unsigned char b0 = buffer[i];
+    unsigned char b1 = buffer[i+1];
+    unsigned char b2 = buffer[i+2];
+    unsigned char b3 = buffer[i+3];
+    // For an unsigned char v (note: '-' binds tighter than '|', so v | v - 1 is v | (v - 1)),
+    // (v | v - 1) is < 128 (highest bit 0) for 0 < v < 128;
+    // (v | v - 1) is >= 128 (highest bit 1) for v == 0 or v >= 128.
+    unsigned char res = b0 | b0 - 1 |  // bit 7 of res ends up set iff at least one of the four bytes is 0 or >= 128
+                        b1 | b1 - 1 |
+                        b2 | b2 - 1 |
+                        b3 | b3 - 1;
+    if (res >= 128) break;  // this group needs the byte-by-byte check; i still points at its first byte
+    i += 4;  // all four bytes are in 1..127, skip the group
+  }
+  for(; i < length; i++) {  // slow path: check the remaining bytes one sequence per iteration
+    unsigned short c;  // value decoded from a multi-byte sequence
+    // no embedded zeros
+    if (buffer[i] == 0) return false;
+    if(buffer[i] < 128) {  // single byte in 1..127: nothing more to check
+      continue;
+    }
+    if ((i + 5) < length) { // see if it's a legal supplementary character; needs the six bytes buffer[i..i+5]
+      if (UTF8::is_supplementary_character(&buffer[i])) {
+        c = UTF8::get_supplementary_character(&buffer[i]);  // NOTE(review): c is not read again after this assignment
+        i += 5;  // together with the loop's i++ this consumes all six bytes
+        continue;
+      }
+    }
+    switch (buffer[i] >> 4) {  // dispatch on the high nibble of the lead byte; it is 0x8..0xF here because buffer[i] >= 128
+      default: break;  // NOTE(review): appears unreachable, every nibble 0x8..0xF has a case below
+      case 0x8: case 0x9: case 0xA: case 0xB: case 0xF:  // 10xxxxxx byte in lead position, or a 1111xxxx lead byte: always rejected
+        return false;
+      case 0xC: case 0xD:  // 110xxxxx  10xxxxxx
+        c = (buffer[i] & 0x1F) << 6;  // high five bits of the value
+        i++;  // advance to the second byte
+        if ((i < length) && ((buffer[i] & 0xC0) == 0x80)) {  // second byte must exist and match 10xxxxxx
+          c += buffer[i] & 0x3F;  // low six bits of the value
+          if (version_leq_47 || c == 0 || c >= 0x80) {  // otherwise reject 1..0x7F in two-byte form; c == 0 accepts the pair 0xC0 0x80
+            break;
+          }
+        }
+        return false;  // truncated sequence, bad second byte, or rejected value
+      case 0xE:  // 1110xxxx 10xxxxxx 10xxxxxx
+        c = (buffer[i] & 0xF) << 12;  // high four bits of the value
+        i += 2;  // advance to the third byte; the i < length test below runs before buffer[i-1] or buffer[i] is read
+        if ((i < length) && ((buffer[i-1] & 0xC0) == 0x80) && ((buffer[i] & 0xC0) == 0x80)) {  // second and third bytes must exist and match 10xxxxxx
+          c += ((buffer[i-1] & 0x3F) << 6) + (buffer[i] & 0x3F);  // middle six bits and low six bits
+          if (version_leq_47 || c >= 0x800) {  // otherwise reject values below 0x800 in three-byte form
+            break;
+          }
+        }
+        return false;  // truncated sequence, bad second or third byte, or rejected value
+    }  // end of switch
+  } // end of for
+  return true;  // no check failed
+}
+
 //-------------------------------------------------------------------------------------
 
 bool UNICODE::is_latin1(jchar c) {