jaxp/src/com/sun/org/apache/xml/internal/serialize/HTMLdtd.java
changeset 12457 c348e06f0e82
parent 6 7f561c08de6b
child 25834 aba3efbf4ec5
equal deleted inserted replaced
12324:1d7e6da6adc8 12457:c348e06f0e82
       
     1 /*
       
     2  * reserved comment block
       
     3  * DO NOT REMOVE OR ALTER!
       
     4  */
       
     5 /*
       
     6  * Copyright 1999-2002,2004 The Apache Software Foundation.
       
     7  *
       
     8  * Licensed under the Apache License, Version 2.0 (the "License");
       
     9  * you may not use this file except in compliance with the License.
       
    10  * You may obtain a copy of the License at
       
    11  *
       
    12  *      http://www.apache.org/licenses/LICENSE-2.0
       
    13  *
       
    14  * Unless required by applicable law or agreed to in writing, software
       
    15  * distributed under the License is distributed on an "AS IS" BASIS,
       
    16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
       
    17  * See the License for the specific language governing permissions and
       
    18  * limitations under the License.
       
    19  */
       
    20 
       
    21 
       
    22 // Aug 21, 2000:
       
    23 //   Fixed bug in isElement and made HTMLdtd public.
       
     24 //   Contributed by Eric SCHAEFFER <eschaeffer@posterconseil.com>
       
    25 
       
    26 
       
    27 package com.sun.org.apache.xml.internal.serialize;
       
    28 
       
    29 import com.sun.org.apache.xerces.internal.dom.DOMMessageFormatter;
       
    30 
       
    31 import java.io.InputStream;
       
    32 import java.io.InputStreamReader;
       
    33 import java.io.BufferedReader;
       
    34 import java.util.Hashtable;
       
    35 import java.util.Locale;
       
    36 
       
    37 
       
    38 /**
       
    39  * Utility class for accessing information specific to HTML documents.
       
    40  * The HTML DTD is expressed as three utility function groups. Two methods
       
    41  * allow for checking whether an element requires an open tag on printing
       
    42  * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}).
       
    43  * <P>
       
    44  * Two other methods translate character references from name to value and
       
    45  * from value to name. A small entities resource is loaded into memory the
       
    46  * first time any of these methods is called for fast and efficient access.
       
    47  *
       
    48  *
       
    49  * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
       
    50  */
       
    51 public final class HTMLdtd
       
    52 {
       
    53 
       
    54     /**
       
    55      * Public identifier for HTML 4.01 (Strict) document type.
       
    56      */
       
    57     public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";
       
    58 
       
    59     /**
       
    60      * System identifier for HTML 4.01 (Strict) document type.
       
    61      */
       
    62     public static final String HTMLSystemId =
       
    63         "http://www.w3.org/TR/html4/strict.dtd";
       
    64 
       
    65     /**
       
    66      * Public identifier for XHTML 1.0 (Strict) document type.
       
    67      */
       
    68     public static final String XHTMLPublicId =
       
    69         "-//W3C//DTD XHTML 1.0 Strict//EN";
       
    70 
       
    71     /**
       
    72      * System identifier for XHTML 1.0 (Strict) document type.
       
    73      */
       
    74     public static final String XHTMLSystemId =
       
    75         "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
       
    76 
       
    77     /**
       
    78      * Table of reverse character reference mapping. Character codes are held
       
    79      * as single-character strings, mapped to their reference name.
       
    80      */
       
    81     private static Hashtable        _byChar;
       
    82 
       
    83 
       
    84     /**
       
    85      * Table of entity name to value mapping. Entities are held as strings,
       
    86      * character references as <TT>Character</TT> objects.
       
    87      */
       
    88     private static Hashtable        _byName;
       
    89 
       
    90 
       
    91     private static Hashtable        _boolAttrs;
       
    92 
       
    93 
       
    94     /**
       
    95      * Holds element definitions.
       
    96      */
       
    97     private static Hashtable        _elemDefs;
       
    98 
       
    99 
       
   100     /**
       
   101      * Locates the HTML entities file that is loaded upon initialization.
       
   102      * This file is a resource loaded with the default class loader.
       
   103      */
       
   104     private static final String     ENTITIES_RESOURCE = "HTMLEntities.res";
       
   105 
       
   106 
       
   107     /**
       
   108      * Only opening tag should be printed.
       
   109      */
       
   110     private static final int ONLY_OPENING = 0x0001;
       
   111 
       
   112     /**
       
   113      * Element contains element content only.
       
   114      */
       
   115     private static final int ELEM_CONTENT = 0x0002;
       
   116 
       
   117 
       
   118     /**
       
   119      * Element preserve spaces.
       
   120      */
       
   121     private static final int PRESERVE     = 0x0004;
       
   122 
       
   123 
       
   124     /**
       
   125      * Optional closing tag.
       
   126      */
       
   127     private static final int OPT_CLOSING  = 0x0008;
       
   128 
       
   129 
       
   130     /**
       
   131      * Element is empty (also means only opening tag)
       
   132      */
       
   133     private static final int EMPTY        = 0x0010 | ONLY_OPENING;
       
   134 
       
   135 
       
   136     /**
       
   137      * Allowed to appear in head.
       
   138      */
       
   139     private static final int ALLOWED_HEAD = 0x0020;
       
   140 
       
   141 
       
   142     /**
       
   143      * When opened, closes P.
       
   144      */
       
   145     private static final int CLOSE_P      = 0x0040;
       
   146 
       
   147 
       
   148     /**
       
   149      * When opened, closes DD or DT.
       
   150      */
       
   151     private static final int CLOSE_DD_DT  = 0x0080;
       
   152 
       
   153 
       
   154     /**
       
   155      * When opened, closes itself.
       
   156      */
       
   157     private static final int CLOSE_SELF   = 0x0100;
       
   158 
       
   159 
       
   160     /**
       
   161      * When opened, closes another table section.
       
   162      */
       
   163     private static final int CLOSE_TABLE  = 0x0200;
       
   164 
       
   165 
       
   166     /**
       
   167      * When opened, closes TH or TD.
       
   168      */
       
   169     private static final int CLOSE_TH_TD  = 0x04000;
       
   170 
       
   171 
       
   172     /**
       
   173      * Returns true if element is declared to be empty. HTML elements are
       
   174      * defines as empty in the DTD, not by the document syntax.
       
   175      *
       
   176      * @param tagName The element tag name (upper case)
       
   177      * @return True if element is empty
       
   178      */
       
   179     public static boolean isEmptyTag( String tagName )
       
   180     {
       
   181         return isElement( tagName, EMPTY );
       
   182     }
       
   183 
       
   184 
       
   185     /**
       
   186      * Returns true if element is declared to have element content.
       
   187      * Whitespaces appearing inside element content will be ignored,
       
   188      * other text will simply report an error.
       
   189      *
       
   190      * @param tagName The element tag name (upper case)
       
   191      * @return True if element content
       
   192      */
       
   193     public static boolean isElementContent( String tagName )
       
   194     {
       
   195         return isElement( tagName, ELEM_CONTENT );
       
   196     }
       
   197 
       
   198 
       
   199     /**
       
   200      * Returns true if element's textual contents preserves spaces.
       
   201      * This only applies to PRE and TEXTAREA, all other HTML elements
       
   202      * do not preserve space.
       
   203      *
       
   204      * @param tagName The element tag name (upper case)
       
   205      * @return True if element's text content preserves spaces
       
   206      */
       
   207     public static boolean isPreserveSpace( String tagName )
       
   208     {
       
   209         return isElement( tagName, PRESERVE );
       
   210     }
       
   211 
       
   212 
       
   213     /**
       
   214      * Returns true if element's closing tag is optional and need not
       
   215      * exist. An error will not be reported for such elements if they
       
   216      * are not closed. For example, <tt>LI</tt> is most often not closed.
       
   217      *
       
   218      * @param tagName The element tag name (upper case)
       
   219      * @return True if closing tag implied
       
   220      */
       
   221     public static boolean isOptionalClosing( String tagName )
       
   222     {
       
   223         return isElement( tagName, OPT_CLOSING );
       
   224     }
       
   225 
       
   226 
       
   227     /**
       
   228      * Returns true if element's closing tag is generally not printed.
       
   229      * For example, <tt>LI</tt> should not print the closing tag.
       
   230      *
       
   231      * @param tagName The element tag name (upper case)
       
   232      * @return True if only opening tag should be printed
       
   233      */
       
   234     public static boolean isOnlyOpening( String tagName )
       
   235     {
       
   236         return isElement( tagName, ONLY_OPENING );
       
   237     }
       
   238 
       
   239 
       
   240     /**
       
   241      * Returns true if the opening of one element (<tt>tagName</tt>) implies
       
   242      * the closing of another open element (<tt>openTag</tt>). For example,
       
   243      * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>,
       
   244      * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>.
       
   245      *
       
   246      * @param tagName The newly opened element
       
   247      * @param openTag The already opened element
       
   248      * @return True if closing tag closes opening tag
       
   249      */
       
   250     public static boolean isClosing( String tagName, String openTag )
       
   251     {
       
   252         // Several elements are defined as closing the HEAD
       
   253         if ( openTag.equalsIgnoreCase( "HEAD" ) )
       
   254             return ! isElement( tagName, ALLOWED_HEAD );
       
   255         // P closes iteself
       
   256         if ( openTag.equalsIgnoreCase( "P" ) )
       
   257             return isElement( tagName, CLOSE_P );
       
   258         // DT closes DD, DD closes DT
       
   259         if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) )
       
   260             return isElement( tagName, CLOSE_DD_DT );
       
   261         // LI and OPTION close themselves
       
   262         if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) )
       
   263             return isElement( tagName, CLOSE_SELF );
       
   264         // Each of these table sections closes all the others
       
   265         if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) ||
       
   266              openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) ||
       
   267              openTag.equalsIgnoreCase( "COLGROUP" ) )
       
   268             return isElement( tagName, CLOSE_TABLE );
       
   269         // TD closes TH and TH closes TD
       
   270         if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) )
       
   271             return isElement( tagName, CLOSE_TH_TD );
       
   272         return false;
       
   273     }
       
   274 
       
   275 
       
   276     /**
       
   277      * Returns true if the specified attribute it a URI and should be
       
   278      * escaped appropriately. In HTML URIs are escaped differently
       
   279      * than normal attributes.
       
   280      *
       
   281      * @param tagName The element's tag name
       
   282      * @param attrName The attribute's name
       
   283      */
       
   284     public static boolean isURI( String tagName, String attrName )
       
   285     {
       
   286         // Stupid checks.
       
   287         return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) );
       
   288     }
       
   289 
       
   290 
       
   291     /**
       
   292      * Returns true if the specified attribute is a boolean and should be
       
   293      * printed without the value. This applies to attributes that are true
       
   294      * if they exist, such as selected (OPTION/INPUT).
       
   295      *
       
   296      * @param tagName The element's tag name
       
   297      * @param attrName The attribute's name
       
   298      */
       
   299     public static boolean isBoolean( String tagName, String attrName )
       
   300     {
       
   301         String[] attrNames;
       
   302 
       
   303         attrNames = (String[]) _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) );
       
   304         if ( attrNames == null )
       
   305             return false;
       
   306         for ( int i = 0 ; i < attrNames.length ; ++i )
       
   307             if ( attrNames[ i ].equalsIgnoreCase( attrName ) )
       
   308                 return true;
       
   309         return false;
       
   310     }
       
   311 
       
   312 
       
   313     /**
       
   314      * Returns the value of an HTML character reference by its name. If the
       
   315      * reference is not found or was not defined as a character reference,
       
   316      * returns EOF (-1).
       
   317      *
       
   318      * @param name Name of character reference
       
   319      * @return Character code or EOF (-1)
       
   320      */
       
   321     public static int charFromName( String name )
       
   322     {
       
   323         Object    value;
       
   324 
       
   325         initialize();
       
   326         value = _byName.get( name );
       
   327         if ( value != null && value instanceof Integer )
       
   328             return ( (Integer) value ).intValue();
       
   329         else
       
   330             return -1;
       
   331     }
       
   332 
       
   333 
       
   334     /**
       
   335      * Returns the name of an HTML character reference based on its character
       
   336      * value. Only valid for entities defined from character references. If no
       
   337      * such character value was defined, return null.
       
   338      *
       
   339      * @param value Character value of entity
       
   340      * @return Entity's name or null
       
   341      */
       
   342     public static String fromChar(int value )
       
   343     {
       
   344        if (value > 0xffff)
       
   345             return null;
       
   346 
       
   347         String name;
       
   348 
       
   349         initialize();
       
   350         name = (String) _byChar.get( new Integer( value ) );
       
   351         return name;
       
   352     }
       
   353 
       
   354 
       
   355     /**
       
   356      * Initialize upon first access. Will load all the HTML character references
       
   357      * into a list that is accessible by name or character value and is optimized
       
   358      * for character substitution. This method may be called any number of times
       
   359      * but will execute only once.
       
   360      */
       
   361     private static void initialize()
       
   362     {
       
   363         InputStream     is = null;
       
   364         BufferedReader  reader = null;
       
   365         int             index;
       
   366         String          name;
       
   367         String          value;
       
   368         int             code;
       
   369         String          line;
       
   370 
       
   371         // Make sure not to initialize twice.
       
   372         if ( _byName != null )
       
   373             return;
       
   374         try {
       
   375             _byName = new Hashtable();
       
   376             _byChar = new Hashtable();
       
   377             is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE );
       
   378             if ( is == null ) {
       
   379                 throw new RuntimeException(
       
   380                                     DOMMessageFormatter.formatMessage(
       
   381                                     DOMMessageFormatter.SERIALIZER_DOMAIN,
       
   382                     "ResourceNotFound", new Object[] {ENTITIES_RESOURCE}));
       
   383             }
       
   384             reader = new BufferedReader( new InputStreamReader( is, "ASCII" ) );
       
   385             line = reader.readLine();
       
   386             while ( line != null ) {
       
   387                 if ( line.length() == 0 || line.charAt( 0 ) == '#' ) {
       
   388                     line = reader.readLine();
       
   389                     continue;
       
   390                 }
       
   391                 index = line.indexOf( ' ' );
       
   392                 if ( index > 1 ) {
       
   393                     name = line.substring( 0, index );
       
   394                     ++index;
       
   395                     if ( index < line.length() ) {
       
   396                         value = line.substring( index );
       
   397                         index = value.indexOf( ' ' );
       
   398                         if ( index > 0 )
       
   399                             value = value.substring( 0, index );
       
   400                         code = Integer.parseInt( value );
       
   401                                         defineEntity( name, (char) code );
       
   402                     }
       
   403                 }
       
   404                 line = reader.readLine();
       
   405             }
       
   406             is.close();
       
   407         }  catch ( Exception except ) {
       
   408                         throw new RuntimeException(
       
   409                                 DOMMessageFormatter.formatMessage(
       
   410                                 DOMMessageFormatter.SERIALIZER_DOMAIN,
       
   411                 "ResourceNotLoaded", new Object[] {ENTITIES_RESOURCE, except.toString()}));
       
   412         } finally {
       
   413             if ( is != null ) {
       
   414                 try {
       
   415                     is.close();
       
   416                 } catch ( Exception except ) { }
       
   417             }
       
   418         }
       
   419     }
       
   420 
       
   421 
       
   422     /**
       
   423      * Defines a new character reference. The reference's name and value are
       
   424      * supplied. Nothing happens if the character reference is already defined.
       
   425      * <P>
       
   426      * Unlike internal entities, character references are a string to single
       
   427      * character mapping. They are used to map non-ASCII characters both on
       
   428      * parsing and printing, primarily for HTML documents. '&lt;amp;' is an
       
   429      * example of a character reference.
       
   430      *
       
   431      * @param name The entity's name
       
   432      * @param value The entity's value
       
   433      */
       
   434     private static void defineEntity( String name, char value )
       
   435     {
       
   436         if ( _byName.get( name ) == null ) {
       
   437             _byName.put( name, new Integer( value ) );
       
   438             _byChar.put( new Integer( value ), name );
       
   439         }
       
   440     }
       
   441 
       
   442 
       
   443     private static void defineElement( String name, int flags )
       
   444     {
       
   445         _elemDefs.put( name, new Integer( flags ) );
       
   446     }
       
   447 
       
   448 
       
   449     private static void defineBoolean( String tagName, String attrName )
       
   450     {
       
   451         defineBoolean( tagName, new String[] { attrName } );
       
   452     }
       
   453 
       
   454 
       
   455     private static void defineBoolean( String tagName, String[] attrNames )
       
   456     {
       
   457         _boolAttrs.put( tagName, attrNames );
       
   458     }
       
   459 
       
   460 
       
   461     private static boolean isElement( String name, int flag )
       
   462     {
       
   463         Integer flags;
       
   464 
       
   465         flags = (Integer) _elemDefs.get( name.toUpperCase(Locale.ENGLISH) );
       
   466         if ( flags == null )
       
   467             return false;
       
   468         else
       
   469             return ( ( flags.intValue() & flag ) == flag );
       
   470     }
       
   471 
       
   472 
       
   473     static
       
   474     {
       
   475         _elemDefs = new Hashtable();
       
   476         defineElement( "ADDRESS", CLOSE_P );
       
   477         defineElement( "AREA", EMPTY );
       
   478         defineElement( "BASE",  EMPTY | ALLOWED_HEAD );
       
   479         defineElement( "BASEFONT", EMPTY );
       
   480         defineElement( "BLOCKQUOTE", CLOSE_P );
       
   481         defineElement( "BODY", OPT_CLOSING );
       
   482         defineElement( "BR", EMPTY );
       
   483         defineElement( "COL", EMPTY );
       
   484         defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
       
   485         defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
       
   486         defineElement( "DIV", CLOSE_P );
       
   487         defineElement( "DL", ELEM_CONTENT | CLOSE_P );
       
   488         defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
       
   489         defineElement( "FIELDSET", CLOSE_P );
       
   490         defineElement( "FORM", CLOSE_P );
       
   491         defineElement( "FRAME", EMPTY | OPT_CLOSING );
       
   492         defineElement( "H1", CLOSE_P );
       
   493         defineElement( "H2", CLOSE_P );
       
   494         defineElement( "H3", CLOSE_P );
       
   495         defineElement( "H4", CLOSE_P );
       
   496         defineElement( "H5", CLOSE_P );
       
   497         defineElement( "H6", CLOSE_P );
       
   498         defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING );
       
   499         defineElement( "HR", EMPTY | CLOSE_P );
       
   500         defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING );
       
   501         defineElement( "IMG", EMPTY );
       
   502         defineElement( "INPUT", EMPTY );
       
   503         defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD );
       
   504         defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
       
   505         defineElement( "LINK", EMPTY | ALLOWED_HEAD );
       
   506         defineElement( "MAP", ALLOWED_HEAD );
       
   507         defineElement( "META", EMPTY | ALLOWED_HEAD );
       
   508         defineElement( "OL", ELEM_CONTENT | CLOSE_P );
       
   509         defineElement( "OPTGROUP", ELEM_CONTENT );
       
   510         defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
       
   511         defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF );
       
   512         defineElement( "PARAM", EMPTY );
       
   513         defineElement( "PRE", PRESERVE | CLOSE_P );
       
   514         defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE );
       
   515         defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE );
       
   516         defineElement( "SELECT", ELEM_CONTENT );
       
   517         defineElement( "STYLE", ALLOWED_HEAD | PRESERVE );
       
   518         defineElement( "TABLE", ELEM_CONTENT | CLOSE_P );
       
   519         defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
       
   520         defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD );
       
   521         defineElement( "TEXTAREA", PRESERVE );
       
   522         defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
       
   523         defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD );
       
   524         defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
       
   525         defineElement( "TITLE", ALLOWED_HEAD );
       
   526         defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
       
   527         defineElement( "UL", ELEM_CONTENT | CLOSE_P );
       
   528 
       
   529         _boolAttrs = new Hashtable();
       
   530         defineBoolean( "AREA", "href" );
       
   531         defineBoolean( "BUTTON", "disabled" );
       
   532         defineBoolean( "DIR", "compact" );
       
   533         defineBoolean( "DL", "compact" );
       
   534         defineBoolean( "FRAME", "noresize" );
       
   535         defineBoolean( "HR", "noshade" );
       
   536         defineBoolean( "IMAGE", "ismap" );
       
   537         defineBoolean( "INPUT", new String[] { "defaultchecked", "checked", "readonly", "disabled" } );
       
   538         defineBoolean( "LINK", "link" );
       
   539         defineBoolean( "MENU", "compact" );
       
   540         defineBoolean( "OBJECT", "declare" );
       
   541         defineBoolean( "OL", "compact" );
       
   542         defineBoolean( "OPTGROUP", "disabled" );
       
   543         defineBoolean( "OPTION", new String[] { "default-selected", "selected", "disabled" } );
       
   544         defineBoolean( "SCRIPT", "defer" );
       
   545         defineBoolean( "SELECT", new String[] { "multiple", "disabled" } );
       
   546         defineBoolean( "STYLE", "disabled" );
       
   547         defineBoolean( "TD", "nowrap" );
       
   548         defineBoolean( "TH", "nowrap" );
       
   549         defineBoolean( "TEXTAREA", new String[] { "disabled", "readonly" } );
       
   550         defineBoolean( "UL", "compact" );
       
   551 
       
   552         initialize();
       
   553     }
       
   554 
       
   555 
       
   556 
       
   557 }