jaxp/src/com/sun/org/apache/xerces/internal/impl/XMLEntityManager.java
changeset 13266 8be9cee3d6dd
parent 13176 2c1fa96ba3d7
child 16953 a44e04deb948
equal deleted inserted replaced
13178:1da54b30d607 13266:8be9cee3d6dd
   600             baseSystemId = expandedSystemId;
   600             baseSystemId = expandedSystemId;
   601         }
   601         }
   602         if (reader == null) {
   602         if (reader == null) {
   603             stream = xmlInputSource.getByteStream();
   603             stream = xmlInputSource.getByteStream();
   604             if (stream == null) {
   604             if (stream == null) {
   605                 URL location = new URL(escapeNonUSAscii(expandedSystemId));
   605                 URL location = new URL(expandedSystemId);
   606                 URLConnection connect = location.openConnection();
   606                 URLConnection connect = location.openConnection();
   607                 if (!(connect instanceof HttpURLConnection)) {
   607                 if (!(connect instanceof HttpURLConnection)) {
   608                     stream = connect.getInputStream();
   608                     stream = connect.getInputStream();
   609                 }
   609                 }
   610                 else {
   610                 else {
  2584         // done
  2584         // done
  2585         return str;
  2585         return str;
  2586 
  2586 
  2587     } // fixURI(String):String
  2587     } // fixURI(String):String
  2588 
  2588 
  2589     /**
       
  2590      * Escape invalid URI characters.
       
  2591      *
       
  2592      * Passed a URI that contains invalid characters (like spaces, non-ASCII Unicode characters, and the like),
       
  2593      * this function percent encodes the invalid characters per the URI specification (i.e., as a sequence of
       
  2594      * %-encoded UTF-8 octets).
       
  2595      *
       
  2596      * N.B. There are two problems. If the URI contains a '%' character, that might be an indication that
       
  2597      * the URI has already been escaped by the author, or it might be an invalid '%'. In the former case,
       
  2598      * it's important not to escape it, or we'll wind up with invalid, doubly-escaped '%'s. In the latter,
       
  2599      * the URI is broken if we don't encode it. Similarly, a '#' character might be the start of a fragment
       
  2600      * identifier or it might be an invalid '#'.
       
  2601      *
       
  2602      * Given that the former is vastly more likely than the latter in each case (most users are familiar with
       
  2603      * the magic status of '%' and '#' and they occur relatively infrequently in filenames, and if the user parses
       
  2604      * a proper Java File, we will already have %-escaped the URI), we simply assume that %'s and #'s are legit.
       
  2605      *
       
  2606      * Very rarely, we may be wrong. If so, tell the user to fix the clearly broken URI.
       
  2607      */
       
  2608     protected static String escapeNonUSAscii(String str) {
       
  2609         if (str == null) {
       
  2610             return str;
       
  2611         }
       
  2612         int len = str.length(), i=0, ch;
       
  2613         for (; i < len; i++) {
       
  2614             ch = str.charAt(i);
       
  2615             // if it's not an ASCII 7 character, break here, and use UTF-8 encoding
       
  2616             if (ch >= 128)
       
  2617                 break;
       
  2618         }
       
  2619 
       
  2620         // we saw no non-ascii-7 character
       
  2621         if (i == len) {
       
  2622             return str;
       
  2623         }
       
  2624 
       
  2625         // get UTF-8 bytes for the string
       
  2626         StringBuffer buffer = new StringBuffer();
       
  2627         byte[] bytes = null;
       
  2628         byte b;
       
  2629         try {
       
  2630             bytes = str.getBytes("UTF-8");
       
  2631         } catch (java.io.UnsupportedEncodingException e) {
       
  2632             // should never happen
       
  2633             return str;
       
  2634         }
       
  2635 
       
  2636         len = bytes.length;
       
  2637 
       
  2638         // for each byte
       
  2639         for (i = 0; i < len; i++) {
       
  2640             b = bytes[i];
       
  2641             // for non-ascii character: make it positive, then escape
       
  2642             if (b < 0) {
       
  2643                 ch = b + 256;
       
  2644                 buffer.append('%');
       
  2645                 buffer.append(gHexChs[ch >> 4]);
       
  2646                 buffer.append(gHexChs[ch & 0xf]);
       
  2647             }
       
  2648             else if (b != '%' && b != '#' && gNeedEscaping[b]) {
       
  2649                 buffer.append('%');
       
  2650                 buffer.append(gAfterEscaping1[b]);
       
  2651                 buffer.append(gAfterEscaping2[b]);
       
  2652             }
       
  2653             else {
       
  2654                 buffer.append((char)b);
       
  2655             }
       
  2656         }
       
  2657         return buffer.toString();
       
  2658     }
       
  2659 
  2589 
  2660     //
  2590     //
  2661     // Package visible methods
  2591     // Package visible methods
  2662     //
  2592     //
  2663     /** Prints the contents of the buffer. */
  2593     /** Prints the contents of the buffer. */