corba/src/share/classes/com/sun/corba/se/impl/encoding/CodeSetConversion.java - edge/openjdk - Git at Google

 /*
  * Copyright (c) 2001, 2004, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.  Oracle designates this
  * particular file as subject to the "Classpath" exception as provided
  * by Oracle in the LICENSE file that accompanied this code.
  *
  * This code is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  * version 2 for more details (a copy is included in the LICENSE file that
  * accompanied this code).
  *
  * You should have received a copy of the GNU General Public License version
  * 2 along with this work; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  *
  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  * or visit www.oracle.com if you need additional information or have any
  * questions.
  */
 package com.sun.corba.se.impl.encoding;

 import java.util.Map;
 import java.util.HashMap;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.MalformedInputException;
 import java.nio.charset.UnsupportedCharsetException;
 import java.nio.charset.UnmappableCharacterException;
 import com.sun.corba.se.impl.logging.ORBUtilSystemException;
 import com.sun.corba.se.impl.logging.OMGSystemException;
 import com.sun.corba.se.spi.logging.CORBALogDomains;

 /**
  * Collection of classes, interfaces, and factory methods for
  * CORBA code set conversion.
  *
  * This is mainly used to shield other code from the sun.io
  * converters which might change, as well as provide some basic
  * translation from conversion to CORBA error exceptions.  Some
  * extra work is required here to facilitate the way CORBA
  * says it uses UTF-16 as of the 00-11-03 spec.
  *
  * REVISIT - Since the nio.Charset and nio.Charset.Encoder/Decoder
  *           use NIO ByteBuffer and NIO CharBuffer, the interaction
  *           and interface between this class and the CDR streams
  *           should be looked at more closely for optimizations to
  *           avoid unnecessary copying of data between char[] &
  *           CharBuffer and byte[] & ByteBuffer, especially
  *           DirectByteBuffers.
  *
  */
 public class CodeSetConversion
 {
     /**
      * Abstraction for char to byte conversion.
      *
      * Must be used in the proper sequence:
      *
      * 1)  convert
      * 2)  Optional getNumBytes and/or getAlignment (if necessary)
      * 3)  getBytes (see warning)
      */
     public abstract static class CTBConverter
     {
         // Perform the conversion of the provided char or String,
         // allowing the caller to query for more information
         // before writing.
         public abstract void convert(char chToConvert);
         public abstract void convert(String strToConvert);

         // How many bytes resulted from the conversion?
         public abstract int getNumBytes();

         // What's the maximum number of bytes per character?
         public abstract float getMaxBytesPerChar();

         public abstract boolean isFixedWidthEncoding();

         // What byte boundary should the stream align to before
         // calling writeBytes?  For instance, a fixed width
         // encoding with 2 bytes per char in a stream which
         // doesn't encapsulate the char's bytes should align
         // on a 2 byte boundary.  (Ex:  UTF16 in GIOP1.1)
         //
         // Note: This has no effect on the converted bytes.  It
         // is just information available to the caller.
         public abstract int getAlignment();

         // Get the resulting bytes.  Warning:  You must use getNumBytes()
         // to determine the end of the data in the byte array instead
         // of array.length!  The array may be used internally, so don't
         // save references.
         public abstract byte[] getBytes();
     }

     /**
      * Abstraction for byte to char conversion.
      */
     public abstract static class BTCConverter
     {
         // In GIOP 1.1, interoperability can only be achieved with
         // fixed width encodings like UTF-16.  This is because wstrings
         // specified how many code points follow rather than specifying
         // the length in octets.
         public abstract boolean isFixedWidthEncoding();
         public abstract int getFixedCharWidth();

         // Called after getChars to determine the true size of the
         // converted array.
         public abstract int getNumChars();

         // Perform the conversion using length bytes from the given
         // input stream.  Warning:  You must use getNumChars() to
         // determine the correct length of the resulting array.
         // The same array may be used internally over multiple
         // calls.
         public abstract char[] getChars(byte[] bytes, int offset, int length);
     }

     /**
      * Implementation of CTBConverter which uses a nio.Charset.CharsetEncoder
      * to do the real work.  Handles translation of exceptions to the
      * appropriate CORBA versions.
      */
     private class JavaCTBConverter extends CTBConverter
     {
         private ORBUtilSystemException wrapper = ORBUtilSystemException.get(
             CORBALogDomains.RPC_ENCODING ) ;

         private OMGSystemException omgWrapper = OMGSystemException.get(
             CORBALogDomains.RPC_ENCODING ) ;

         // nio.Charset.CharsetEncoder actually does the work here
         // have to use it directly rather than through String's interface
         // because we want to know when errors occur during the conversion.
         private CharsetEncoder ctb;

         // Proper alignment for this type of converter.  For instance,
         // ASCII has alignment of 1 (1 byte per char) but UTF16 has
         // alignment of 2 (2 bytes per char)
         private int alignment;

         // Char buffer to hold the input.
         private char[] chars = null;

         // How many bytes are generated from the conversion?
         private int numBytes = 0;

         // How many characters were converted (temporary variable
         // for cross method communication)
         private int numChars = 0;

         // ByteBuffer holding the converted input.  This is necessary
         // since we have to do calculations that require the conversion
         // before writing the array to the stream.
         private ByteBuffer buffer;

         // What code set are we using?
         private OSFCodeSetRegistry.Entry codeset;

         public JavaCTBConverter(OSFCodeSetRegistry.Entry codeset,
                                 int alignmentForEncoding) {

             try {
                 ctb = cache.getCharToByteConverter(codeset.getName());
                 if (ctb == null) {
                     Charset tmpCharset = Charset.forName(codeset.getName());
                     ctb = tmpCharset.newEncoder();
                     cache.setConverter(codeset.getName(), ctb);
                 }
             } catch(IllegalCharsetNameException icne) {

                 // This can only happen if one of our Entries has
                 // an invalid name.
                 throw wrapper.invalidCtbConverterName(icne,codeset.getName());
             } catch(UnsupportedCharsetException ucne) {

                 // This can only happen if one of our Entries has
                 // an unsupported name.
                 throw wrapper.invalidCtbConverterName(ucne,codeset.getName());
             }

             this.codeset = codeset;
             alignment = alignmentForEncoding;
         }

         public final float getMaxBytesPerChar() {
             return ctb.maxBytesPerChar();
         }

         public void convert(char chToConvert) {
             if (chars == null)
                 chars = new char[1];

             // The CharToByteConverter only takes a char[]
             chars[0] = chToConvert;
             numChars = 1;

             convertCharArray();
         }

         public void convert(String strToConvert) {
             // Try to save a memory allocation if possible.  Usual
             // space/time trade off.  If we could get the char[] out of
             // the String without copying, that would be great, but
             // it's forbidden since String is immutable.
             if (chars == null || chars.length < strToConvert.length())
                 chars = new char[strToConvert.length()];

             numChars = strToConvert.length();

             strToConvert.getChars(0, numChars, chars, 0);

             convertCharArray();
         }

         public final int getNumBytes() {
             return numBytes;
         }

         public final int getAlignment() {
             return alignment;
         }

         public final boolean isFixedWidthEncoding() {
             return codeset.isFixedWidth();
         }

         public byte[] getBytes() {
             // Note that you can't use buffer.length since the buffer might
             // be larger than the actual number of converted bytes depending
             // on the encoding.
             return buffer.array();
         }

         private void convertCharArray() {
             try {

                 // Possible optimization of directly converting into the CDR buffer.
                 // However, that means the CDR code would have to reserve
                 // a 4 byte string length ahead of time, and we'd need a
                 // confusing partial conversion scheme for when we couldn't
                 // fit everything in the buffer but needed to know the
                 // converted length before proceeding due to fragmentation.
                 // Then there's the issue of the chunking code.
                 //
                 // For right now, this is less messy and basic tests don't
                 // show more than a 1 ms penalty worst case.  Less than a
                 // factor of 2 increase.

                 // Convert the characters
                 buffer = ctb.encode(CharBuffer.wrap(chars,0,numChars));

                 // ByteBuffer returned by the encoder will set its limit
                 // to byte immediately after the last written byte.
                 numBytes = buffer.limit();

             } catch (IllegalStateException ise) {
                 // an encoding operation is already in progress
                 throw wrapper.ctbConverterFailure( ise ) ;
             } catch (MalformedInputException mie) {
                 // There were illegal Unicode char pairs
                 throw wrapper.badUnicodePair( mie ) ;
             } catch (UnmappableCharacterException uce) {
                 // A character doesn't map to the desired code set
                 // CORBA formal 00-11-03.
                 throw omgWrapper.charNotInCodeset( uce ) ;
             } catch (CharacterCodingException cce) {
                 // If this happens, then some other encoding error occured
                 throw wrapper.ctbConverterFailure( cce ) ;
             }
         }
     }

     /**
      * Special UTF16 converter which can either always write a BOM
      * or use a specified byte order without one.
      */
     private class UTF16CTBConverter extends JavaCTBConverter
     {
         // Using this constructor, we will always write a BOM
         public UTF16CTBConverter() {
             super(OSFCodeSetRegistry.UTF_16, 2);
         }

         // Using this constructor, we don't use a BOM and use the
         // byte order specified
         public UTF16CTBConverter(boolean littleEndian) {
             super(littleEndian ?
                   OSFCodeSetRegistry.UTF_16LE :
                   OSFCodeSetRegistry.UTF_16BE,
                   2);
         }
     }

     /**
      * Implementation of BTCConverter which uses a sun.io.ByteToCharConverter
      * for the real work.  Handles translation of exceptions to the
      * appropriate CORBA versions.
      */
     private class JavaBTCConverter extends BTCConverter
     {
         private ORBUtilSystemException wrapper = ORBUtilSystemException.get(
             CORBALogDomains.RPC_ENCODING ) ;

         private OMGSystemException omgWrapper = OMGSystemException.get(
             CORBALogDomains.RPC_ENCODING ) ;

         protected CharsetDecoder btc;
         private char[] buffer;
         private int resultingNumChars;
         private OSFCodeSetRegistry.Entry codeset;

         public JavaBTCConverter(OSFCodeSetRegistry.Entry codeset) {

             // Obtain a Decoder
             btc = this.getConverter(codeset.getName());

             this.codeset = codeset;
         }

         public final boolean isFixedWidthEncoding() {
             return codeset.isFixedWidth();
         }

         // Should only be called if isFixedWidthEncoding is true
         // IMPORTANT: This calls OSFCodeSetRegistry.Entry, not
         //            CharsetDecoder.maxCharsPerByte().
         public final int getFixedCharWidth() {
             return codeset.getMaxBytesPerChar();
         }

         public final int getNumChars() {
             return resultingNumChars;
         }

         public char[] getChars(byte[] bytes, int offset, int numBytes) {

             // Possible optimization of reading directly from the CDR
             // byte buffer.  The sun.io converter supposedly can handle
             // incremental conversions in which a char is broken across
             // two convert calls.
             //
             // Basic tests didn't show more than a 1 ms increase
             // worst case.  It's less than a factor of 2 increase.
             // Also makes the interface more difficult.


             try {

                 ByteBuffer byteBuf = ByteBuffer.wrap(bytes, offset, numBytes);
                 CharBuffer charBuf = btc.decode(byteBuf);

                 // CharBuffer returned by the decoder will set its limit
                 // to byte immediately after the last written byte.
                 resultingNumChars = charBuf.limit();

                 // IMPORTANT - It's possible the underlying char[] in the
                 //             CharBuffer returned by btc.decode(byteBuf)
                 //             is longer in length than the number of characters
                 //             decoded. Hence, the check below to ensure the
                 //             char[] returned contains all the chars that have
                 //             been decoded and no more.
                 if (charBuf.limit() == charBuf.capacity()) {
                     buffer = charBuf.array();
                 } else {
                     buffer = new char[charBuf.limit()];
                     charBuf.get(buffer, 0, charBuf.limit()).position(0);
                 }

                 return buffer;

             } catch (IllegalStateException ile) {
                 // There were a decoding operation already in progress
                 throw wrapper.btcConverterFailure( ile ) ;
             } catch (MalformedInputException mie) {
                 // There were illegal Unicode char pairs
                 throw wrapper.badUnicodePair( mie ) ;
             } catch (UnmappableCharacterException uce) {
                 // A character doesn't map to the desired code set.
                 // CORBA formal 00-11-03.
                 throw omgWrapper.charNotInCodeset( uce ) ;
             } catch (CharacterCodingException cce) {
                 // If this happens, then a character decoding error occured.
                 throw wrapper.btcConverterFailure( cce ) ;
             }
         }

         /**
          * Utility method to find a CharsetDecoder in the
          * cache or create a new one if necessary.  Throws an
          * INTERNAL if the code set is unknown.
          */
         protected CharsetDecoder getConverter(String javaCodeSetName) {

             CharsetDecoder result = null;
             try {
                 result = cache.getByteToCharConverter(javaCodeSetName);

                 if (result == null) {
                     Charset tmpCharset = Charset.forName(javaCodeSetName);
                     result = tmpCharset.newDecoder();
                     cache.setConverter(javaCodeSetName, result);
                 }

             } catch(IllegalCharsetNameException icne) {
                 // This can only happen if one of our charset entries has
                 // an illegal name.
                 throw wrapper.invalidBtcConverterName( icne, javaCodeSetName ) ;
             }

             return result;
         }
     }

     /**
      * Special converter for UTF16 since it's required to optionally
      * support a byte order marker while the internal Java converters
      * either require it or require that it isn't there.
      *
      * The solution is to check for the byte order marker, and if we
      * need to do something differently, switch internal converters.
      */
     private class UTF16BTCConverter extends JavaBTCConverter
     {
         private boolean defaultToLittleEndian;
         private boolean converterUsesBOM = true;

         private static final char UTF16_BE_MARKER = (char) 0xfeff;
         private static final char UTF16_LE_MARKER = (char) 0xfffe;

         // When there isn't a byte order marker, used the byte
         // order specified.
         public UTF16BTCConverter(boolean defaultToLittleEndian) {
             super(OSFCodeSetRegistry.UTF_16);

             this.defaultToLittleEndian = defaultToLittleEndian;
         }

         public char[] getChars(byte[] bytes, int offset, int numBytes) {

             if (hasUTF16ByteOrderMarker(bytes, offset, numBytes)) {
                 if (!converterUsesBOM)
                     switchToConverter(OSFCodeSetRegistry.UTF_16);

                 converterUsesBOM = true;

                 return super.getChars(bytes, offset, numBytes);
             } else {
                 if (converterUsesBOM) {
                     if (defaultToLittleEndian)
                         switchToConverter(OSFCodeSetRegistry.UTF_16LE);
                     else
                         switchToConverter(OSFCodeSetRegistry.UTF_16BE);

                     converterUsesBOM = false;
                 }

                 return super.getChars(bytes, offset, numBytes);
             }
         }

         /**
          * Utility method for determining if a UTF-16 byte order marker is present.
          */
         private boolean hasUTF16ByteOrderMarker(byte[] array, int offset, int length) {
             // If there aren't enough bytes to represent the marker and data,
             // return false.
             if (length >= 4) {

                 int b1 = array[offset] & 0x00FF;
                 int b2 = array[offset + 1] & 0x00FF;

                 char marker = (char)((b1 << 8) | (b2 << 0));

                 return (marker == UTF16_BE_MARKER || marker == UTF16_LE_MARKER);
             } else
                 return false;
         }

         /**
          * The current solution for dealing with UTF-16 in CORBA
          * is that if our sun.io converter requires byte order markers,
          * and then we see a CORBA wstring/wchar without them, we
          * switch to the sun.io converter that doesn't require them.
          */
         private void switchToConverter(OSFCodeSetRegistry.Entry newCodeSet) {

             // Use the getConverter method from our superclass.
             btc = super.getConverter(newCodeSet.getName());
         }
     }

     /**
      * CTB converter factory for single byte or variable length encodings.
      */
     public CTBConverter getCTBConverter(OSFCodeSetRegistry.Entry codeset) {
         int alignment = (!codeset.isFixedWidth() ?
                          1 :
                          codeset.getMaxBytesPerChar());

         return new JavaCTBConverter(codeset, alignment);
     }

     /**
      * CTB converter factory for multibyte (mainly fixed) encodings.
      *
      * Because of the awkwardness with byte order markers and the possibility of
      * using UCS-2, you must specify both the endianness of the stream as well as
      * whether or not to use byte order markers if applicable.  UCS-2 has no byte
      * order markers.  UTF-16 has optional markers.
      *
      * If you select useByteOrderMarkers, there is no guarantee that the encoding
      * will use the endianness specified.
      *
      */
     public CTBConverter getCTBConverter(OSFCodeSetRegistry.Entry codeset,
                                         boolean littleEndian,
                                         boolean useByteOrderMarkers) {

         // UCS2 doesn't have byte order markers, and we're encoding it
         // as UTF-16 since UCS2 isn't available in all Java platforms.
         // They should be identical with only minor differences in
         // negative cases.
         if (codeset == OSFCodeSetRegistry.UCS_2)
             return new UTF16CTBConverter(littleEndian);

         // We can write UTF-16 with or without a byte order marker.
         if (codeset == OSFCodeSetRegistry.UTF_16) {
             if (useByteOrderMarkers)
                 return new UTF16CTBConverter();
             else
                 return new UTF16CTBConverter(littleEndian);
         }

         // Everything else uses the generic JavaCTBConverter.
         //
         // Variable width encodings are aligned on 1 byte boundaries.
         // A fixed width encoding with a max. of 4 bytes/char should
         // align on a 4 byte boundary.  Note that UTF-16 is a special
         // case because of the optional byte order marker, so it's
         // handled above.
         //
         // This doesn't matter for GIOP 1.2 wchars and wstrings
         // since the encoded bytes are treated as an encapsulation.
         int alignment = (!codeset.isFixedWidth() ?
                          1 :
                          codeset.getMaxBytesPerChar());

         return new JavaCTBConverter(codeset, alignment);
     }

     /**
      * BTCConverter factory for single byte or variable width encodings.
      */
     public BTCConverter getBTCConverter(OSFCodeSetRegistry.Entry codeset) {
         return new JavaBTCConverter(codeset);
     }

     /**
      * BTCConverter factory for fixed width multibyte encodings.
      */
     public BTCConverter getBTCConverter(OSFCodeSetRegistry.Entry codeset,
                                         boolean defaultToLittleEndian) {

         if (codeset == OSFCodeSetRegistry.UTF_16 ||
             codeset == OSFCodeSetRegistry.UCS_2) {

             return new UTF16BTCConverter(defaultToLittleEndian);
         } else {
             return new JavaBTCConverter(codeset);
         }
     }

     /**
      * Follows the code set negotiation algorithm in CORBA formal 99-10-07 13.7.2.
      *
      * Returns the proper negotiated OSF character encoding number or
      * CodeSetConversion.FALLBACK_CODESET.
      */
     private int selectEncoding(CodeSetComponentInfo.CodeSetComponent client,
                                CodeSetComponentInfo.CodeSetComponent server) {

         // A "null" value for the server's nativeCodeSet means that
         // the server desired not to indicate one.  We'll take that
         // to mean that it wants the first thing in its conversion list.
         // If it's conversion list is empty, too, then use the fallback
         // codeset.
         int serverNative = server.nativeCodeSet;

         if (serverNative == 0) {
             if (server.conversionCodeSets.length > 0)
                 serverNative = server.conversionCodeSets[0];
             else
                 return CodeSetConversion.FALLBACK_CODESET;
         }

         if (client.nativeCodeSet == serverNative) {
             // Best case -- client and server don't have to convert
             return serverNative;
         }

         // Is this client capable of converting to the server's
         // native code set?
         for (int i = 0; i < client.conversionCodeSets.length; i++) {
             if (serverNative == client.conversionCodeSets[i]) {
                 // The client will convert to the server's
                 // native code set.
                 return serverNative;
             }
         }

         // Is the server capable of converting to the client's
         // native code set?
         for (int i = 0; i < server.conversionCodeSets.length; i++) {
             if (client.nativeCodeSet == server.conversionCodeSets[i]) {
                 // The server will convert to the client's
                 // native code set.
                 return client.nativeCodeSet;
             }
         }

         // See if there are any code sets that both the server and client
         // support (giving preference to the server).  The order
         // of conversion sets is from most to least desired.
         for (int i = 0; i < server.conversionCodeSets.length; i++) {
             for (int y = 0; y < client.conversionCodeSets.length; y++) {
                 if (server.conversionCodeSets[i] == client.conversionCodeSets[y]) {
                     return server.conversionCodeSets[i];
                 }
             }
         }

         // Before using the fallback codesets, the spec calls for a
         // compatibility check on the native code sets.  It doesn't make
         // sense because loss free communication is always possible with
         // UTF8 and UTF16, the fall back code sets.  It's also a lot
         // of work to implement.  In the case of incompatibility, the
         // spec says to throw a CODESET_INCOMPATIBLE exception.

         // Use the fallback
         return CodeSetConversion.FALLBACK_CODESET;
     }

     /**
      * Perform the code set negotiation algorithm and come up with
      * the two encodings to use.
      */
     public CodeSetComponentInfo.CodeSetContext negotiate(CodeSetComponentInfo client,
                                                          CodeSetComponentInfo server) {
         int charData
             = selectEncoding(client.getCharComponent(),
                              server.getCharComponent());

         if (charData == CodeSetConversion.FALLBACK_CODESET) {
             charData = OSFCodeSetRegistry.UTF_8.getNumber();
         }

         int wcharData
             = selectEncoding(client.getWCharComponent(),
                              server.getWCharComponent());

         if (wcharData == CodeSetConversion.FALLBACK_CODESET) {
             wcharData = OSFCodeSetRegistry.UTF_16.getNumber();
         }

         return new CodeSetComponentInfo.CodeSetContext(charData,
                                                        wcharData);
     }

     // No one should instantiate a CodeSetConversion but the singleton
     // instance method
     private CodeSetConversion() {}

     // initialize-on-demand holder
     private static class CodeSetConversionHolder {
         static final CodeSetConversion csc = new CodeSetConversion() ;
     }

     /**
      * CodeSetConversion is a singleton, and this is the access point.
      */
     public final static CodeSetConversion impl() {
         return CodeSetConversionHolder.csc ;
     }

     // Singleton instance
     private static CodeSetConversion implementation;

     // Number used internally to indicate the fallback code
     // set.
     private static final int FALLBACK_CODESET = 0;

     // Provides a thread local cache for the sun.io
     // converters.
     private CodeSetCache cache = new CodeSetCache();
 }