src/hotspot/share/utilities/utf8.cpp
author coleenp
Sun, 18 Feb 2018 13:32:24 -0500
changeset 49011 a0e246b7403a
parent 47216 71c04702a3d5
child 51823 2a51125b2794
permissions -rw-r--r--
8182847: Copy class should use assert macros Reviewed-by: kbarrett, tschatzl
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
     1
/*
40901
0c83ed47db08 8164743: Convert TestAsUtf8 to GTest
kzhaldyb
parents: 36508
diff changeset
     2
 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
     3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
489c9b5090e2 Initial load
duke
parents:
diff changeset
     4
 *
489c9b5090e2 Initial load
duke
parents:
diff changeset
     5
 * This code is free software; you can redistribute it and/or modify it
489c9b5090e2 Initial load
duke
parents:
diff changeset
     6
 * under the terms of the GNU General Public License version 2 only, as
489c9b5090e2 Initial load
duke
parents:
diff changeset
     7
 * published by the Free Software Foundation.
489c9b5090e2 Initial load
duke
parents:
diff changeset
     8
 *
489c9b5090e2 Initial load
duke
parents:
diff changeset
     9
 * This code is distributed in the hope that it will be useful, but WITHOUT
489c9b5090e2 Initial load
duke
parents:
diff changeset
    10
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
489c9b5090e2 Initial load
duke
parents:
diff changeset
    11
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
489c9b5090e2 Initial load
duke
parents:
diff changeset
    12
 * version 2 for more details (a copy is included in the LICENSE file that
489c9b5090e2 Initial load
duke
parents:
diff changeset
    13
 * accompanied this code).
489c9b5090e2 Initial load
duke
parents:
diff changeset
    14
 *
489c9b5090e2 Initial load
duke
parents:
diff changeset
    15
 * You should have received a copy of the GNU General Public License version
489c9b5090e2 Initial load
duke
parents:
diff changeset
    16
 * 2 along with this work; if not, write to the Free Software Foundation,
489c9b5090e2 Initial load
duke
parents:
diff changeset
    17
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
489c9b5090e2 Initial load
duke
parents:
diff changeset
    18
 *
5547
f4b087cbb361 6941466: Oracle rebranding changes for Hotspot repositories
trims
parents: 1
diff changeset
    19
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
f4b087cbb361 6941466: Oracle rebranding changes for Hotspot repositories
trims
parents: 1
diff changeset
    20
 * or visit www.oracle.com if you need additional information or have any
f4b087cbb361 6941466: Oracle rebranding changes for Hotspot repositories
trims
parents: 1
diff changeset
    21
 * questions.
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
    22
 *
489c9b5090e2 Initial load
duke
parents:
diff changeset
    23
 */
489c9b5090e2 Initial load
duke
parents:
diff changeset
    24
7397
5b173b4ca846 6989984: Use standard include model for Hospot
stefank
parents: 5547
diff changeset
    25
#include "precompiled.hpp"
5b173b4ca846 6989984: Use standard include model for Hospot
stefank
parents: 5547
diff changeset
    26
#include "utilities/utf8.hpp"
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
    27
489c9b5090e2 Initial load
duke
parents:
diff changeset
    28
// Assume the utf8 string is in legal form and has been
489c9b5090e2 Initial load
duke
parents:
diff changeset
    29
// checked in the class file parser/format checker.
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
    30
template<typename T> char* UTF8::next(const char* str, T* value) {
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
    31
  unsigned const char *ptr = (const unsigned char *)str;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    32
  unsigned char ch, ch2, ch3;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    33
  int length = -1;              /* bad length */
489c9b5090e2 Initial load
duke
parents:
diff changeset
    34
  jchar result;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    35
  switch ((ch = ptr[0]) >> 4) {
489c9b5090e2 Initial load
duke
parents:
diff changeset
    36
    default:
489c9b5090e2 Initial load
duke
parents:
diff changeset
    37
    result = ch;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    38
    length = 1;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    39
    break;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    40
489c9b5090e2 Initial load
duke
parents:
diff changeset
    41
  case 0x8: case 0x9: case 0xA: case 0xB: case 0xF:
489c9b5090e2 Initial load
duke
parents:
diff changeset
    42
    /* Shouldn't happen. */
489c9b5090e2 Initial load
duke
parents:
diff changeset
    43
    break;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    44
489c9b5090e2 Initial load
duke
parents:
diff changeset
    45
  case 0xC: case 0xD:
489c9b5090e2 Initial load
duke
parents:
diff changeset
    46
    /* 110xxxxx  10xxxxxx */
489c9b5090e2 Initial load
duke
parents:
diff changeset
    47
    if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
489c9b5090e2 Initial load
duke
parents:
diff changeset
    48
      unsigned char high_five = ch & 0x1F;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    49
      unsigned char low_six = ch2 & 0x3F;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    50
      result = (high_five << 6) + low_six;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    51
      length = 2;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    52
      break;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    53
    }
489c9b5090e2 Initial load
duke
parents:
diff changeset
    54
    break;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    55
489c9b5090e2 Initial load
duke
parents:
diff changeset
    56
  case 0xE:
489c9b5090e2 Initial load
duke
parents:
diff changeset
    57
    /* 1110xxxx 10xxxxxx 10xxxxxx */
489c9b5090e2 Initial load
duke
parents:
diff changeset
    58
    if (((ch2 = ptr[1]) & 0xC0) == 0x80) {
489c9b5090e2 Initial load
duke
parents:
diff changeset
    59
      if (((ch3 = ptr[2]) & 0xC0) == 0x80) {
489c9b5090e2 Initial load
duke
parents:
diff changeset
    60
        unsigned char high_four = ch & 0x0f;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    61
        unsigned char mid_six = ch2 & 0x3f;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    62
        unsigned char low_six = ch3 & 0x3f;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    63
        result = (((high_four << 6) + mid_six) << 6) + low_six;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    64
        length = 3;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    65
      }
489c9b5090e2 Initial load
duke
parents:
diff changeset
    66
    }
489c9b5090e2 Initial load
duke
parents:
diff changeset
    67
    break;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    68
  } /* end of switch */
489c9b5090e2 Initial load
duke
parents:
diff changeset
    69
489c9b5090e2 Initial load
duke
parents:
diff changeset
    70
  if (length <= 0) {
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
    71
    *value = (T)ptr[0];    /* default bad result; */
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
    72
    return (char*)(ptr + 1); // make progress somehow
489c9b5090e2 Initial load
duke
parents:
diff changeset
    73
  }
489c9b5090e2 Initial load
duke
parents:
diff changeset
    74
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
    75
  *value = (T)result;
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
    76
489c9b5090e2 Initial load
duke
parents:
diff changeset
    77
  // The assert is correct but the .class file is wrong
489c9b5090e2 Initial load
duke
parents:
diff changeset
    78
  // assert(UNICODE::utf8_size(result) == length, "checking reverse computation");
489c9b5090e2 Initial load
duke
parents:
diff changeset
    79
  return (char *)(ptr + length);
489c9b5090e2 Initial load
duke
parents:
diff changeset
    80
}
489c9b5090e2 Initial load
duke
parents:
diff changeset
    81
489c9b5090e2 Initial load
duke
parents:
diff changeset
    82
char* UTF8::next_character(const char* str, jint* value) {
489c9b5090e2 Initial load
duke
parents:
diff changeset
    83
  unsigned const char *ptr = (const unsigned char *)str;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    84
  /* See if it's legal supplementary character:
489c9b5090e2 Initial load
duke
parents:
diff changeset
    85
     11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx */
489c9b5090e2 Initial load
duke
parents:
diff changeset
    86
  if (is_supplementary_character(ptr)) {
489c9b5090e2 Initial load
duke
parents:
diff changeset
    87
    *value = get_supplementary_character(ptr);
489c9b5090e2 Initial load
duke
parents:
diff changeset
    88
    return (char *)(ptr + 6);
489c9b5090e2 Initial load
duke
parents:
diff changeset
    89
  }
489c9b5090e2 Initial load
duke
parents:
diff changeset
    90
  jchar result;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    91
  char* next_ch = next(str, &result);
489c9b5090e2 Initial load
duke
parents:
diff changeset
    92
  *value = result;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    93
  return next_ch;
489c9b5090e2 Initial load
duke
parents:
diff changeset
    94
}
489c9b5090e2 Initial load
duke
parents:
diff changeset
    95
489c9b5090e2 Initial load
duke
parents:
diff changeset
    96
// Count bytes of the form 10xxxxxx and deduct this count
489c9b5090e2 Initial load
duke
parents:
diff changeset
    97
// from the total byte count.  The utf8 string must be in
489c9b5090e2 Initial load
duke
parents:
diff changeset
    98
// legal form which has been verified in the format checker.
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
    99
int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_multibyte) {
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   100
  int num_chars = len;
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   101
  has_multibyte = false;
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   102
  is_latin1 = true;
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   103
  unsigned char prev = 0;
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   104
  for (int i = 0; i < len; i++) {
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   105
    unsigned char c = str[i];
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   106
    if ((c & 0xC0) == 0x80) {
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   107
      // Multibyte, check if valid latin1 character.
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   108
      has_multibyte = true;
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   109
      if (prev > 0xC3) {
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   110
        is_latin1 = false;
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   111
      }
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   112
      --num_chars;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   113
    }
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   114
    prev = c;
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   115
  }
489c9b5090e2 Initial load
duke
parents:
diff changeset
   116
  return num_chars;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   117
}
489c9b5090e2 Initial load
duke
parents:
diff changeset
   118
489c9b5090e2 Initial load
duke
parents:
diff changeset
   119
// Count bytes of the utf8 string except those in form
489c9b5090e2 Initial load
duke
parents:
diff changeset
   120
// 10xxxxxx which only appear in multibyte characters.
489c9b5090e2 Initial load
duke
parents:
diff changeset
   121
// The utf8 string must be in legal form and has been
489c9b5090e2 Initial load
duke
parents:
diff changeset
   122
// verified in the format checker.
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   123
int UTF8::unicode_length(const char* str, bool& is_latin1, bool& has_multibyte) {
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   124
  int num_chars = 0;
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   125
  has_multibyte = false;
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   126
  is_latin1 = true;
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   127
  unsigned char prev = 0;
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   128
  for (const char* p = str; *p; p++) {
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   129
    unsigned char c = (*p);
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   130
    if ((c & 0xC0) == 0x80) {
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   131
      // Multibyte, check if valid latin1 character.
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   132
      has_multibyte = true;
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   133
      if (prev > 0xC3) {
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   134
        is_latin1 = false;
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   135
      }
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   136
    } else {
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   137
      num_chars++;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   138
    }
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   139
    prev = c;
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   140
  }
489c9b5090e2 Initial load
duke
parents:
diff changeset
   141
  return num_chars;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   142
}
489c9b5090e2 Initial load
duke
parents:
diff changeset
   143
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   144
// Writes a jchar as utf8 and returns the end
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   145
static u_char* utf8_write(u_char* base, jchar ch) {
489c9b5090e2 Initial load
duke
parents:
diff changeset
   146
  if ((ch != 0) && (ch <=0x7f)) {
489c9b5090e2 Initial load
duke
parents:
diff changeset
   147
    base[0] = (u_char) ch;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   148
    return base + 1;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   149
  }
489c9b5090e2 Initial load
duke
parents:
diff changeset
   150
489c9b5090e2 Initial load
duke
parents:
diff changeset
   151
  if (ch <= 0x7FF) {
489c9b5090e2 Initial load
duke
parents:
diff changeset
   152
    /* 11 bits or less. */
489c9b5090e2 Initial load
duke
parents:
diff changeset
   153
    unsigned char high_five = ch >> 6;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   154
    unsigned char low_six = ch & 0x3F;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   155
    base[0] = high_five | 0xC0; /* 110xxxxx */
489c9b5090e2 Initial load
duke
parents:
diff changeset
   156
    base[1] = low_six | 0x80;   /* 10xxxxxx */
489c9b5090e2 Initial load
duke
parents:
diff changeset
   157
    return base + 2;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   158
  }
489c9b5090e2 Initial load
duke
parents:
diff changeset
   159
  /* possibly full 16 bits. */
489c9b5090e2 Initial load
duke
parents:
diff changeset
   160
  char high_four = ch >> 12;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   161
  char mid_six = (ch >> 6) & 0x3F;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   162
  char low_six = ch & 0x3f;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   163
  base[0] = high_four | 0xE0; /* 1110xxxx */
489c9b5090e2 Initial load
duke
parents:
diff changeset
   164
  base[1] = mid_six | 0x80;   /* 10xxxxxx */
489c9b5090e2 Initial load
duke
parents:
diff changeset
   165
  base[2] = low_six | 0x80;   /* 10xxxxxx */
489c9b5090e2 Initial load
duke
parents:
diff changeset
   166
  return base + 3;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   167
}
489c9b5090e2 Initial load
duke
parents:
diff changeset
   168
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   169
// Decodes utf8_str into exactly unicode_length elements of unicode_str.
// The caller supplies the number of characters to produce; no terminator is
// looked for in the input and none is written to the output.
template<typename T> void UTF8::convert_to_unicode(const char* utf8_str, T* unicode_str, int unicode_length) {
  const char* cursor = utf8_str;
  int written = 0;

  /* Fast path: copy leading single-byte (<= 0x7F) characters directly. */
  while (written < unicode_length) {
    const unsigned char byte = (unsigned char)*cursor;
    if (byte > 0x7F) {
      break;
    }
    unicode_str[written++] = (T)byte;
    cursor++;
  }

  /* General path: decode whatever remains one character at a time. */
  while (written < unicode_length) {
    cursor = UTF8::next(cursor, &unicode_str[written]);
    written++;
  }
}
489c9b5090e2 Initial load
duke
parents:
diff changeset
   185
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   186
// Explicit instantiation for all supported string types.
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   187
template char* UTF8::next<jchar>(const char* str, jchar* value);
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   188
template char* UTF8::next<jbyte>(const char* str, jbyte* value);
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   189
template void UTF8::convert_to_unicode<jchar>(const char* utf8_str, jchar* unicode_str, int unicode_length);
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   190
template void UTF8::convert_to_unicode<jbyte>(const char* utf8_str, jbyte* unicode_str, int unicode_length);
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   191
14477
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   192
// returns the quoted ascii length of a 0-terminated utf8 string
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   193
int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   194
  const char *ptr = utf8_str;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   195
  const char* end = ptr + utf8_length;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   196
  int result = 0;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   197
  while (ptr < end) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   198
    jchar c;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   199
    ptr = UTF8::next(ptr, &c);
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   200
    if (c >= 32 && c < 127) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   201
      result++;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   202
    } else {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   203
      result += 6;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   204
    }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   205
  }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   206
  return result;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   207
}
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   208
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   209
// converts a utf8 string to quoted ascii
16602
5df51d3bc550 8011048: Possible reading from unmapped memory in UTF8::as_quoted_ascii()
iklam
parents: 14477
diff changeset
   210
void UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen) {
14477
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   211
  const char *ptr = utf8_str;
16602
5df51d3bc550 8011048: Possible reading from unmapped memory in UTF8::as_quoted_ascii()
iklam
parents: 14477
diff changeset
   212
  const char *utf8_end = ptr + utf8_length;
14477
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   213
  char* p = buf;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   214
  char* end = buf + buflen;
16602
5df51d3bc550 8011048: Possible reading from unmapped memory in UTF8::as_quoted_ascii()
iklam
parents: 14477
diff changeset
   215
  while (ptr < utf8_end) {
14477
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   216
    jchar c;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   217
    ptr = UTF8::next(ptr, &c);
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   218
    if (c >= 32 && c < 127) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   219
      if (p + 1 >= end) break;      // string is truncated
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   220
      *p++ = (char)c;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   221
    } else {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   222
      if (p + 6 >= end) break;      // string is truncated
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   223
      sprintf(p, "\\u%04x", c);
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   224
      p += 6;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   225
    }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   226
  }
16602
5df51d3bc550 8011048: Possible reading from unmapped memory in UTF8::as_quoted_ascii()
iklam
parents: 14477
diff changeset
   227
  assert(p < end, "sanity");
14477
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   228
  *p = '\0';
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   229
}
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   230
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   231
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   232
const char* UTF8::from_quoted_ascii(const char* quoted_ascii_str) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   233
  const char *ptr = quoted_ascii_str;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   234
  char* result = NULL;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   235
  while (*ptr != '\0') {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   236
    char c = *ptr;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   237
    if (c < 32 || c >= 127) break;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   238
  }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   239
  if (*ptr == '\0') {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   240
    // nothing to do so return original string
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   241
    return quoted_ascii_str;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   242
  }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   243
  // everything up to this point was ok.
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   244
  int length = ptr - quoted_ascii_str;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   245
  char* buffer = NULL;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   246
  for (int round = 0; round < 2; round++) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   247
    while (*ptr != '\0') {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   248
      if (*ptr != '\\') {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   249
        if (buffer != NULL) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   250
          buffer[length] = *ptr;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   251
        }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   252
        length++;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   253
      } else {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   254
        switch (ptr[1]) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   255
          case 'u': {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   256
            ptr += 2;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   257
            jchar value=0;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   258
            for (int i=0; i<4; i++) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   259
              char c = *ptr++;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   260
              switch (c) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   261
                case '0': case '1': case '2': case '3': case '4':
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   262
                case '5': case '6': case '7': case '8': case '9':
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   263
                  value = (value << 4) + c - '0';
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   264
                  break;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   265
                case 'a': case 'b': case 'c':
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   266
                case 'd': case 'e': case 'f':
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   267
                  value = (value << 4) + 10 + c - 'a';
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   268
                  break;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   269
                case 'A': case 'B': case 'C':
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   270
                case 'D': case 'E': case 'F':
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   271
                  value = (value << 4) + 10 + c - 'A';
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   272
                  break;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   273
                default:
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   274
                  ShouldNotReachHere();
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   275
              }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   276
            }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   277
            if (buffer == NULL) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   278
              char utf8_buffer[4];
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   279
              char* next = (char*)utf8_write((u_char*)utf8_buffer, value);
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   280
              length += next - utf8_buffer;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   281
            } else {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   282
              char* next = (char*)utf8_write((u_char*)&buffer[length], value);
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   283
              length += next - &buffer[length];
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   284
            }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   285
            break;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   286
          }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   287
          case 't': if (buffer != NULL) buffer[length] = '\t'; ptr += 2; length++; break;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   288
          case 'n': if (buffer != NULL) buffer[length] = '\n'; ptr += 2; length++; break;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   289
          case 'r': if (buffer != NULL) buffer[length] = '\r'; ptr += 2; length++; break;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   290
          case 'f': if (buffer != NULL) buffer[length] = '\f'; ptr += 2; length++; break;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   291
          default:
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   292
            ShouldNotReachHere();
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   293
        }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   294
      }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   295
    }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   296
    if (round == 0) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   297
      buffer = NEW_RESOURCE_ARRAY(char, length + 1);
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   298
      ptr = quoted_ascii_str;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   299
    } else {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   300
      buffer[length] = '\0';
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   301
    }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   302
  }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   303
  return buffer;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   304
}
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   305
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   306
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   307
// Returns NULL if 'c' it not found. This only works as long
489c9b5090e2 Initial load
duke
parents:
diff changeset
   308
// as 'c' is an ASCII character
8076
96d498ec7ae1 6990754: Use native memory and reference counting to implement SymbolTable
coleenp
parents: 7397
diff changeset
   309
const jbyte* UTF8::strrchr(const jbyte* base, int length, jbyte c) {
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   310
  assert(length >= 0, "sanity check");
489c9b5090e2 Initial load
duke
parents:
diff changeset
   311
  assert(c >= 0, "does not work for non-ASCII characters");
489c9b5090e2 Initial load
duke
parents:
diff changeset
   312
  // Skip backwards in string until 'c' is found or end is reached
489c9b5090e2 Initial load
duke
parents:
diff changeset
   313
  while(--length >= 0 && base[length] != c);
489c9b5090e2 Initial load
duke
parents:
diff changeset
   314
  return (length < 0) ? NULL : &base[length];
489c9b5090e2 Initial load
duke
parents:
diff changeset
   315
}
489c9b5090e2 Initial load
duke
parents:
diff changeset
   316
8076
96d498ec7ae1 6990754: Use native memory and reference counting to implement SymbolTable
coleenp
parents: 7397
diff changeset
   317
bool UTF8::equal(const jbyte* base1, int length1, const jbyte* base2, int length2) {
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   318
  // Length must be the same
489c9b5090e2 Initial load
duke
parents:
diff changeset
   319
  if (length1 != length2) return false;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   320
  for (int i = 0; i < length1; i++) {
489c9b5090e2 Initial load
duke
parents:
diff changeset
   321
    if (base1[i] != base2[i]) return false;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   322
  }
489c9b5090e2 Initial load
duke
parents:
diff changeset
   323
  return true;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   324
}
489c9b5090e2 Initial load
duke
parents:
diff changeset
   325
489c9b5090e2 Initial load
duke
parents:
diff changeset
   326
// Returns true if the six bytes starting at 'str' match the pattern
//   0xED 1010xxxx 10xxxxxx 0xED 1011xxxx 10xxxxxx.
// Bytes are examined in order and the check stops at the first mismatch.
// The caller must guarantee that the bytes being examined are readable.
bool UTF8::is_supplementary_character(const unsigned char* str) {
  // First three-byte group: ED Ax 10xxxxxx
  if (str[0] != 0xED)          return false;
  if ((str[1] & 0xF0) != 0xA0) return false;
  if ((str[2] & 0xC0) != 0x80) return false;
  // Second three-byte group: ED Bx 10xxxxxx
  if (str[3] != 0xED)          return false;
  if ((str[4] & 0xF0) != 0xB0) return false;
  return (str[5] & 0xC0) == 0x80;
}
489c9b5090e2 Initial load
duke
parents:
diff changeset
   330
489c9b5090e2 Initial load
duke
parents:
diff changeset
   331
// Assembles a code point from the six-byte sequence at 'str'. Only the payload
// bits of bytes 1, 2, 4 and 5 are used; bytes 0 and 3 are not read. No
// validation is done here.
jint UTF8::get_supplementary_character(const unsigned char* str) {
  const jint bits_19_16 = (str[1] & 0x0f) << 16;
  const jint bits_15_10 = (str[2] & 0x3f) << 10;
  const jint bits_9_6   = (str[4] & 0x0f) << 6;
  const jint bits_5_0   =  str[5] & 0x3f;
  return 0x10000 + bits_19_16 + bits_15_10 + bits_9_6 + bits_5_0;
}
489c9b5090e2 Initial load
duke
parents:
diff changeset
   335
36508
5f9eee6b383b 8142968: Module System implementation
alanb
parents: 33628
diff changeset
   336
// Checks whether the 'length' bytes at 'buffer' form a legal encoded string.
//
// Rejected input:
//   - any zero byte,
//   - a lead byte with high nibble 0x8-0xB or 0xF,
//   - a two- or three-byte sequence that is cut short or whose trailing
//     bytes are not of the form 10xxxxxx,
//   - unless 'version_leq_47' is true: a two-byte sequence decoding to
//     1..0x7F, or a three-byte sequence decoding to less than 0x800.
// A six-byte sequence accepted by is_supplementary_character() is skipped
// as a unit.
//
// NOTE(review): 'version_leq_47' presumably refers to the class file
// version of the data being checked -- confirm against callers.
bool UTF8::is_legal_utf8(const unsigned char* buffer, int length,
                         bool version_leq_47) {
  int i = 0;

  // Fast path: consume four bytes at a time while all of them are plain
  // non-zero ASCII.
  int count = length >> 2;
  for (int k = 0; k < count; k++) {
    unsigned char b0 = buffer[i];
    unsigned char b1 = buffer[i+1];
    unsigned char b2 = buffer[i+2];
    unsigned char b3 = buffer[i+3];
    // For an unsigned char v,
    // (v | v - 1) is < 128 (highest bit 0) for 0 < v < 128;
    // (v | v - 1) is >= 128 (highest bit 1) for v == 0 or v >= 128.
    unsigned char res = b0 | b0 - 1 |
                        b1 | b1 - 1 |
                        b2 | b2 - 1 |
                        b3 | b3 - 1;
    if (res >= 128) break;
    i += 4;
  }

  // Slow path: validate the remaining bytes one sequence at a time.
  for (; i < length; i++) {
    // no embedded zeros
    if (buffer[i] == 0) return false;
    if (buffer[i] < 128) {
      continue;
    }
    if ((i + 5) < length) { // see if it's legal supplementary character
      if (UTF8::is_supplementary_character(&buffer[i])) {
        // The decoded value is not needed for validation; skip the five
        // trailing bytes (the loop increment skips the sixth).
        i += 5;
        continue;
      }
    }
    switch (buffer[i] >> 4) {
      default: break;
      case 0x8: case 0x9: case 0xA: case 0xB: case 0xF:
        return false;
      case 0xC: case 0xD: {  // 110xxxxx  10xxxxxx
        unsigned short c = (buffer[i] & 0x1F) << 6;
        i++;
        if ((i < length) && ((buffer[i] & 0xC0) == 0x80)) {
          c += buffer[i] & 0x3F;
          // Zero is accepted in two-byte form; 1..0x7F must not be.
          if (version_leq_47 || c == 0 || c >= 0x80) {
            break;
          }
        }
        return false;
      }
      case 0xE: {  // 1110xxxx 10xxxxxx 10xxxxxx
        unsigned short c = (buffer[i] & 0xF) << 12;
        i += 2;
        if ((i < length) && ((buffer[i-1] & 0xC0) == 0x80) && ((buffer[i] & 0xC0) == 0x80)) {
          c += ((buffer[i-1] & 0x3F) << 6) + (buffer[i] & 0x3F);
          if (version_leq_47 || c >= 0x800) {
            break;
          }
        }
        return false;
      }
    }  // end of switch
  } // end of for
  return true;
}
5f9eee6b383b 8142968: Module System implementation
alanb
parents: 33628
diff changeset
   397
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   398
//-------------------------------------------------------------------------------------
489c9b5090e2 Initial load
duke
parents:
diff changeset
   399
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   400
// Returns true if 'c' lies in the range 0x0000..0x00FF.
bool UNICODE::is_latin1(jchar c) {
  return c < 0x0100;
}
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   403
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   404
// Returns true if none of the first 'length' elements of 'base' exceeds
// 0x00FF. An empty range yields true.
bool UNICODE::is_latin1(jchar* base, int length) {
  int index = 0;
  while (index < length) {
    const jchar c = base[index];
    if (c > 0x00FF) {
      return false;
    }
    index++;
  }
  return true;
}
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   412
489c9b5090e2 Initial load
duke
parents:
diff changeset
   413
// Number of bytes needed to encode 'c':
//   0x0001..0x007F -> 1
//   0x0000 and 0x0080..0x07FF -> 2
//   everything above 0x07FF -> 3
int UNICODE::utf8_size(jchar c) {
  if (c > 0x07FF) {
    return 3;
  }
  if (c == 0x0000 || c > 0x007F) {
    // Zero is deliberately not a one-byte character here.
    return 2;
  }
  // ASCII character
  return 1;
}
489c9b5090e2 Initial load
duke
parents:
diff changeset
   423
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   424
// Number of bytes needed to encode the single byte 'c'.
// Because c is signed, (c >= 0x01) is equivalent to (0x01 <= c) && (c <= 0x7F),
// i.e. an ASCII character, which takes one byte. Everything else -- a
// non-ASCII character or 0x00, which needs to be two-byte encoded as 0xC080
// in modified UTF-8 -- takes two bytes.
int UNICODE::utf8_size(jbyte c) {
  return (c >= 0x01) ? 1 : 2;
}
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   435
42057
6a5b8ebcd3f2 8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents: 40901
diff changeset
   436
template<typename T>
6a5b8ebcd3f2 8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents: 40901
diff changeset
   437
int UNICODE::utf8_length(T* base, int length) {
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   438
  int result = 0;
489c9b5090e2 Initial load
duke
parents:
diff changeset
   439
  for (int index = 0; index < length; index++) {
42057
6a5b8ebcd3f2 8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents: 40901
diff changeset
   440
    T c = base[index];
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   441
    result += utf8_size(c);
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   442
  }
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   443
  return result;
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   444
}
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   445
42057
6a5b8ebcd3f2 8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents: 40901
diff changeset
   446
template<typename T>
6a5b8ebcd3f2 8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents: 40901
diff changeset
   447
char* UNICODE::as_utf8(T* base, int& length) {
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   448
  int utf8_len = utf8_length(base, length);
24237
7b210ef8c830 6664815: Eliminate redundant memcpy operation in jni_GetStringUTFRegion
mgerdin
parents: 16602
diff changeset
   449
  u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1);
7b210ef8c830 6664815: Eliminate redundant memcpy operation in jni_GetStringUTFRegion
mgerdin
parents: 16602
diff changeset
   450
  char* result = as_utf8(base, length, (char*) buf, utf8_len + 1);
7b210ef8c830 6664815: Eliminate redundant memcpy operation in jni_GetStringUTFRegion
mgerdin
parents: 16602
diff changeset
   451
  assert((int) strlen(result) == utf8_len, "length prediction must be correct");
42057
6a5b8ebcd3f2 8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents: 40901
diff changeset
   452
  // Set string length to uft8 length
6a5b8ebcd3f2 8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents: 40901
diff changeset
   453
  length = utf8_len;
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   454
  return (char*) result;
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   455
}
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   456
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   457
// Encodes the first 'length' elements of 'base' into the caller-supplied
// buffer 'buf' of 'buflen' bytes and returns 'buf'.
//
// The output is always NUL-terminated. If the buffer is too small the
// string is truncated at a character boundary: a character is written only
// if it fits with at least one byte left over for the terminator.
//
// 'buflen' must be at least 1, because the terminator is written
// unconditionally.
char* UNICODE::as_utf8(jchar* base, int length, char* buf, int buflen) {
  assert(buflen > 0, "zero length output buffer");
  u_char* p = (u_char*)buf;
  for (int index = 0; index < length; index++) {
    jchar c = base[index];
    // Reserve the bytes for this character before writing it.
    buflen -= utf8_size(c);
    if (buflen <= 0) break; // string is truncated
    p = utf8_write(p, c);
  }
  *p = '\0';
  return buf;
}
489c9b5090e2 Initial load
duke
parents:
diff changeset
   468
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   469
// Encodes the first 'length' bytes of 'base' into the caller-supplied
// buffer 'buf' of 'buflen' bytes and returns 'buf'. Each input byte is
// treated as one character with value (byte & 0xff).
//
// The output is always NUL-terminated. If the buffer is too small the
// string is truncated at a character boundary: a character is written only
// if it fits with at least one byte left over for the terminator.
//
// 'buflen' must be at least 1, because the terminator is written
// unconditionally.
char* UNICODE::as_utf8(jbyte* base, int length, char* buf, int buflen) {
  assert(buflen > 0, "zero length output buffer");
  u_char* p = (u_char*)buf;
  for (int index = 0; index < length; index++) {
    jbyte c = base[index];
    int sz = utf8_size(c);
    // Reserve the bytes for this character before writing it.
    buflen -= sz;
    if (buflen <= 0) break; // string is truncated
    if (sz == 1) {
      // Copy ASCII characters (UTF-8 is ASCII compatible)
      *p++ = c;
    } else {
      // Non-ASCII character or 0x00 which should
      // be encoded as 0xC080 in "modified" UTF8.
      p = utf8_write(p, ((jchar) c) & 0xff);
    }
  }
  *p = '\0';
  return buf;
}
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   489
1
489c9b5090e2 Initial load
duke
parents:
diff changeset
   490
void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) {
489c9b5090e2 Initial load
duke
parents:
diff changeset
   491
  for(int index = 0; index < length; index++) {
489c9b5090e2 Initial load
duke
parents:
diff changeset
   492
    utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]);
489c9b5090e2 Initial load
duke
parents:
diff changeset
   493
  }
489c9b5090e2 Initial load
duke
parents:
diff changeset
   494
  *utf8_buffer = '\0';
489c9b5090e2 Initial load
duke
parents:
diff changeset
   495
}
14477
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   496
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   497
// returns the quoted ascii length of a unicode string
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   498
template<typename T>
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   499
int UNICODE::quoted_ascii_length(T* base, int length) {
14477
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   500
  int result = 0;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   501
  for (int i = 0; i < length; i++) {
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   502
    T c = base[i];
14477
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   503
    if (c >= 32 && c < 127) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   504
      result++;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   505
    } else {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   506
      result += 6;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   507
    }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   508
  }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   509
  return result;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   510
}
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   511
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   512
// converts a unicode string to quoted ascii
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   513
template<typename T>
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   514
void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) {
14477
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   515
  char* p = buf;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   516
  char* end = buf + buflen;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   517
  for (int index = 0; index < length; index++) {
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   518
    T c = base[index];
14477
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   519
    if (c >= 32 && c < 127) {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   520
      if (p + 1 >= end) break;      // string is truncated
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   521
      *p++ = (char)c;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   522
    } else {
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   523
      if (p + 6 >= end) break;      // string is truncated
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   524
      sprintf(p, "\\u%04x", c);
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   525
      p += 6;
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   526
    }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   527
  }
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   528
  *p = '\0';
95e66ea71f71 6830717: replay of compilations would help with debugging
minqi
parents: 8921
diff changeset
   529
}
24237
7b210ef8c830 6664815: Eliminate redundant memcpy operation in jni_GetStringUTFRegion
mgerdin
parents: 16602
diff changeset
   530
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   531
// Explicit instantiation for all supported types.
42057
6a5b8ebcd3f2 8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents: 40901
diff changeset
   532
template int UNICODE::utf8_length(jbyte* base, int length);
6a5b8ebcd3f2 8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents: 40901
diff changeset
   533
template int UNICODE::utf8_length(jchar* base, int length);
6a5b8ebcd3f2 8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents: 40901
diff changeset
   534
template char* UNICODE::as_utf8(jbyte* base, int& length);
6a5b8ebcd3f2 8164612: NoSuchMethodException when method name contains NULL or Latin-1 supplement character
thartmann
parents: 40901
diff changeset
   535
template char* UNICODE::as_utf8(jchar* base, int& length);
33628
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   536
template int UNICODE::quoted_ascii_length<jbyte>(jbyte* base, int length);
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   537
template int UNICODE::quoted_ascii_length<jchar>(jchar* base, int length);
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   538
template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, int buflen);
09241459a8b8 8141132: JEP 254: Compact Strings
thartmann
parents: 24237
diff changeset
   539
template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, int buflen);