ML Reference
MeVis/Foundation/Sources/MLUtilities/private/mlConvertUTF.h
Go to the documentation of this file.
00001 
00002 
00003 /*
00004  * Copyright 2001 Unicode, Inc.
00005  *
00006  * Disclaimer
00007  *
00008  * This source code is provided as is by Unicode, Inc. No claims are
00009  * made as to fitness for any particular purpose. No warranties of any
00010  * kind are expressed or implied. The recipient agrees to determine
00011  * applicability of information provided. If this file has been
00012  * purchased on magnetic or optical media from Unicode, Inc., the
00013  * sole remedy for any claim will be exchange of defective media
00014  * within 90 days of receipt.
00015  *
00016  * Limitations on Rights to Redistribute This Code
00017  *
00018  * Unicode, Inc. hereby grants the right to freely use the information
00019  * supplied in this file in the creation of products supporting the
00020  * Unicode Standard, and to make copies of this file in any form
00021  * for internal or external distribution as long as this notice
00022  * remains attached.
00023  */
00024 
00025 /* ---------------------------------------------------------------------
00026 
00027     Conversions between UTF-32, UTF-16, and UTF-8.  Header file.
00028 
00029     Several functions are included here, forming a complete set of
00030     conversions between the three formats.  UTF-7 is not included
00031     here, but is handled in a separate source file.
00032 
00033     Each of these routines takes pointers to input buffers and output
00034     buffers. The input buffers are const.
00035 
00036     Each routine converts the text between *sourceStart and sourceEnd,
00037     putting the result into the buffer between *targetStart and
00038     targetEnd. Note: the end pointers are *after* the last item: e.g.,
00039     *(sourceEnd - 1) is the last item.
00040 
00041     The return result indicates whether the conversion was successful,
00042     and if not, whether the problem was in the source or target buffers.
00043     (Only the first encountered problem is indicated.)
00044 
00045     After the conversion, *sourceStart and *targetStart are both
00046     updated to point to the end of last text successfully converted in
00047     the respective buffers.
00048 
00049     General (parameter) errors:
00050     - If an illegal parameter (i.e., a NULL pointer) is passed, then
00051       for a NULL sourceStart, sourceEnd or length "sourceIllegal"
00052       is returned. \n
00053       For a NULL targetStart or targetEnd "targetExhausted" is
00054       returned. \n
00055       None of the parameters passed to these functions is
00056       changed in those cases.
00057     - If an exception is caught, "sourceIllegal" or "false" is
00058       returned after sending a fatal error message to the ML error handler.
00059       The state of the target buffers and return pointers is
00060       undefined in that case.
00061 
00062     Input parameters:
00063     sourceStart - pointer to a pointer to the source buffer.
00064     The contents of this are modified on return so that
00065     it points at the next thing to be converted.
00066     targetStart - similarly, pointer to pointer to the target buffer.
00067     sourceEnd, targetEnd - respectively pointers to the ends of the
00068     two buffers, for overflow checking only.
00069 
00070     These conversion functions take a ConversionFlags argument. When this
00071     flag is set to strict, both irregular sequences and isolated surrogates
00072     will cause an error.  When the flag is set to lenient, both irregular
00073     sequences and isolated surrogates are converted.
00074 
00075     Whether the flag is strict or lenient, all illegal sequences will cause
00076     an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
00077     or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
00078     must check for illegal sequences.
00079 
00080     When the flag is set to lenient, characters over 0x10FFFF are converted
00081     to the replacement character; otherwise (when the flag is set to strict)
00082     they constitute an error.
00083 
00084     Output parameters:
00085     The value "sourceIllegal" is returned from some routines if the input
00086     sequence is malformed.  When "sourceIllegal" is returned, the source
00087     value will point to the illegal value that caused the problem. E.g.,
00088     in UTF-8 when a sequence is malformed, it points to the start of the
00089     malformed sequence.
00090 
00091     Author: Mark E. Davis, 1994.
00092     Rev History: Rick McGowan, fixes & updates May 2001.
00093     Fixes & updates, Sept 2001.
00094 
00095 ------------------------------------------------------------------------ */
00096 
00097 //--------------------------------------------------------------------
00113 //--------------------------------------------------------------------
00114 
00115 #ifndef __mlConvertUTF_H
00116 #define __mlConvertUTF_H
00117 
00118 #ifndef __mlTypeDefs_H
00119 #include "mlTypeDefs.h"
00120 #endif
00121 
00122 //--------------------------------------------------------------------
00128 //--------------------------------------------------------------------
//! Element types of the UTF-32, UTF-16 and UTF-8 buffers handled by the conversion routines below.
//! NOTE(review): MLuint32/MLuint16/MLuint8 are presumably the unsigned 32/16/8 bit types
//! from mlTypeDefs.h - their definitions are not visible in this file, confirm there.
00129 typedef MLuint32 UTF32;   //!< Element type of UTF-32 encoded buffers.
00130 typedef MLuint16 UTF16;   //!< Element type of UTF-16 encoded buffers.
00131 typedef MLuint8  UTF8;    //!< Element type of UTF-8 encoded buffers.
00132 
00133 
00134 //--------------------------------------------------------------------
00136 //--------------------------------------------------------------------
//! Unicode constants used by the conversion routines, all typed as UTF32.
//! NOTE(review): static_cast is C++-only syntax, so these macros cannot be expanded in a C
//! translation unit even though the function declarations below are wrapped in extern "C".
00137 #define UNI_REPLACEMENT_CHAR (static_cast<UTF32>(0x0000FFFD))  //!< Used instead of invalid characters on lenient conversion.
00138 #define UNI_MAX_BMP          (static_cast<UTF32>(0x0000FFFF))  //!< Highest value of the Basic Multilingual Plane (0xFFFF).
00139 #define UNI_MAX_UTF16        (static_cast<UTF32>(0x0010FFFF))  //!< 0x10FFFF; the file header treats UTF-32 values above this as illegal.
00140 #define UNI_MAX_UTF32        (static_cast<UTF32>(0x7FFFFFFF))  //!< Presumably the upper bound accepted for UTF-32 values; usage not visible here - confirm.
00141 
00142 
00143 //--------------------------------------------------------------------
00145 //--------------------------------------------------------------------
//! Result code returned by all conversion and calculation functions of this header.
//! Only the first encountered problem is reported (see file header comment).
00146 typedef enum {
00147   conversionOK,     //!< The conversion was successful.
00148   sourceExhausted,  //!< NOTE(review): presumably the source ended in the middle of a character - confirm against the implementation.
00149   targetExhausted,  //!< Problem in the target buffer; also returned for a NULL targetStart or targetEnd.
00150   sourceIllegal     //!< Source sequence is malformed; also returned for a NULL sourceStart, sourceEnd or length.
00151 } ConversionResult;
00152 
00153 //--------------------------------------------------------------------
00155 //--------------------------------------------------------------------
//! Selects how irregular sequences and isolated surrogates are treated during conversion.
//! Per the file header comment, illegal sequences cause an error return with either flag.
00156 typedef enum {
00157   strictConversion = 0, //!< Irregular sequences and isolated surrogates cause an error.
00158   lenientConversion     //!< Irregular sequences and isolated surrogates are converted; characters over 0x10FFFF become the replacement character.
00159 } ConversionFlags;
00160 
00161 //--------------------------------------------------------------------
00162 /* This is for C++ and does no harm in C */
00163 //--------------------------------------------------------------------
00164 #ifdef __cplusplus
00165 extern "C" {
00166 #endif
00167 
00168 //---------------------------------------
00170 
00171 //---------------------------------------
00172 
//! Converts the UTF-32 text between *sourceStart and sourceEnd to UTF-16 and writes
//! it into the buffer between *targetStart and targetEnd.
//! \param sourceStart In/out: address of the pointer to the first source item; per the file
//!                    header it is updated to the end of the last successfully converted text.
//! \param sourceEnd   Pointer after the last source item (*(sourceEnd - 1) is the last item).
//! \param targetStart In/out: address of the pointer to the target buffer; updated like sourceStart.
//! \param targetEnd   Pointer after the last target item, used for overflow checking only.
//! \param flags       strictConversion or lenientConversion.
//! \return conversionOK on success; sourceIllegal for NULL source arguments or malformed
//!         input; targetExhausted for NULL target arguments (see file header comment).
00176 ConversionResult ConvertUTF32toUTF16 (const UTF32   **sourceStart,
00177                                       const UTF32    *sourceEnd,
00178                                       UTF16         **targetStart,
00179                                       UTF16          *targetEnd,
00180                                       ConversionFlags flags);
00181 
//! Converts the UTF-16 text between *sourceStart and sourceEnd to UTF-32 and writes
//! it into the buffer between *targetStart and targetEnd.
//! \param sourceStart In/out: address of the pointer to the first source item; per the file
//!                    header it is updated to the end of the last successfully converted text.
//! \param sourceEnd   Pointer after the last source item (*(sourceEnd - 1) is the last item).
//! \param targetStart In/out: address of the pointer to the target buffer; updated like sourceStart.
//! \param targetEnd   Pointer after the last target item, used for overflow checking only.
//! \param flags       strictConversion or lenientConversion.
//! \return conversionOK on success; sourceIllegal for NULL source arguments or malformed
//!         input; targetExhausted for NULL target arguments (see file header comment).
00185 ConversionResult ConvertUTF16toUTF32 (const UTF16   **sourceStart,
00186                                       const UTF16    *sourceEnd,
00187                                       UTF32         **targetStart,
00188                                       UTF32          *targetEnd,
00189                                       ConversionFlags flags);
00190 
//! Converts the UTF-16 text between *sourceStart and sourceEnd to UTF-8 and writes
//! it into the buffer between *targetStart and targetEnd.
//! \param sourceStart In/out: address of the pointer to the first source item; per the file
//!                    header it is updated to the end of the last successfully converted text.
//! \param sourceEnd   Pointer after the last source item (*(sourceEnd - 1) is the last item).
//! \param targetStart In/out: address of the pointer to the target buffer; updated like sourceStart.
//! \param targetEnd   Pointer after the last target item, used for overflow checking only.
//! \param flags       strictConversion or lenientConversion.
//! \return conversionOK on success; sourceIllegal for NULL source arguments or malformed
//!         input; targetExhausted for NULL target arguments (see file header comment).
00194 ConversionResult ConvertUTF16toUTF8 ( const UTF16   **sourceStart, 
00195                                       const UTF16    *sourceEnd,
00196                                       UTF8          **targetStart, 
00197                                       UTF8           *targetEnd,
00198                                       ConversionFlags flags);
00199 
//! Converts the UTF-8 text between *sourceStart and sourceEnd to UTF-16 and writes
//! it into the buffer between *targetStart and targetEnd.
//! \param sourceStart In/out: address of the pointer to the first source item; per the file
//!                    header it is updated to the end of the last successfully converted text and,
//!                    on sourceIllegal, points to the start of the malformed sequence.
//! \param sourceEnd   Pointer after the last source item (*(sourceEnd - 1) is the last item).
//! \param targetStart In/out: address of the pointer to the target buffer; updated like sourceStart.
//! \param targetEnd   Pointer after the last target item, used for overflow checking only.
//! \param flags       strictConversion or lenientConversion.
//! \return conversionOK on success; sourceIllegal for NULL source arguments or malformed
//!         input; targetExhausted for NULL target arguments (see file header comment).
00203 ConversionResult ConvertUTF8toUTF16 ( const UTF8    **sourceStart,
00204                                       const UTF8     *sourceEnd,
00205                                       UTF16         **targetStart, 
00206                                       UTF16          *targetEnd, 
00207                                       ConversionFlags flags);
00208 
//! Converts the UTF-32 text between *sourceStart and sourceEnd to UTF-8 and writes
//! it into the buffer between *targetStart and targetEnd.
//! \param sourceStart In/out: address of the pointer to the first source item; per the file
//!                    header it is updated to the end of the last successfully converted text.
//! \param sourceEnd   Pointer after the last source item (*(sourceEnd - 1) is the last item).
//! \param targetStart In/out: address of the pointer to the target buffer; updated like sourceStart.
//! \param targetEnd   Pointer after the last target item, used for overflow checking only.
//! \param flags       strictConversion or lenientConversion.
//! \return conversionOK on success; sourceIllegal for NULL source arguments or malformed
//!         input; targetExhausted for NULL target arguments (see file header comment).
00212 ConversionResult ConvertUTF32toUTF8  (const UTF32   **sourceStart,
00213                                       const UTF32    *sourceEnd,
00214                                       UTF8          **targetStart, 
00215                                       UTF8           *targetEnd, 
00216                                       ConversionFlags flags);
00217 
//! Converts the UTF-8 text between *sourceStart and sourceEnd to UTF-32 and writes
//! it into the buffer between *targetStart and targetEnd.
//! Unlike the other pairwise converters this function takes no ConversionFlags argument.
//! NOTE(review): whether it behaves strictly or leniently is not visible in this header - confirm.
//! \param sourceStart In/out: address of the pointer to the first source item; per the file
//!                    header it is updated to the end of the last successfully converted text and,
//!                    on sourceIllegal, points to the start of the malformed sequence.
//! \param sourceEnd   Pointer after the last source item (*(sourceEnd - 1) is the last item).
//! \param targetStart In/out: address of the pointer to the target buffer; updated like sourceStart.
//! \param targetEnd   Pointer after the last target item, used for overflow checking only.
//! \return conversionOK on success; sourceIllegal for NULL source arguments or malformed
//!         input; targetExhausted for NULL target arguments (see file header comment).
00221 ConversionResult ConvertUTF8toUTF32 ( const UTF8    **sourceStart,
00222                                       const UTF8     *sourceEnd,
00223                                       UTF32         **targetStart, 
00224                                       UTF32          *targetEnd);
00225 
00226 
00227 
00228 //---------------------------------------
00230 
00231 //---------------------------------------
//! Converts UTF-8 text starting at sourceStart to Latin-1 characters written into the
//! buffer between targetStart and targetEnd.
//! All pointers are passed by value, so the caller's pointers are not advanced by this call.
//! NOTE(review): no sourceEnd is passed, so the source is presumably NUL-terminated; how
//! characters without a Latin-1 representation are handled is not visible here - confirm both.
//! \param sourceStart Pointer to the first UTF-8 item to convert.
//! \param targetStart Pointer to the start of the target char buffer.
//! \param targetEnd   End pointer of the target buffer (presumably after the last item, as for
//!                    the other routines).
//! \param flags       strictConversion or lenientConversion.
//! \return A ConversionResult; per the file header sourceIllegal for a NULL sourceStart and
//!         targetExhausted for a NULL targetStart or targetEnd.
00236 ConversionResult ConvertUTF8toLatin1 (const UTF8     *sourceStart,
00237                                       char           *targetStart,
00238                                       char           *targetEnd,
00239                                       ConversionFlags flags);
00240 
//! Calculates the number of characters in the UTF-8 text starting at sourceStart.
//! NOTE(review): presumably the source is NUL-terminated (no end pointer is passed) and the
//! count is stored in *length - neither is visible in this header, confirm against the implementation.
//! \param sourceStart Pointer to the first UTF-8 item.
//! \param length      Output pointer receiving the result; must not be NULL.
//! \param flags       strictConversion or lenientConversion.
//! \return A ConversionResult; per the file header sourceIllegal if sourceStart or length is NULL.
00246 ConversionResult CalculateNumCharsInUTF8 (const UTF8     *sourceStart,
00247                                           unsigned int   *length,
00248                                           ConversionFlags flags);
00249 
//! Calculates the size of the UTF-16 buffer needed for the UTF-8 text starting at sourceStart.
//! NOTE(review): presumably the source is NUL-terminated and the size is stored in *length;
//! whether it counts UTF16 items or bytes, and whether a terminator is included, is not
//! visible in this header - confirm against the implementation.
//! \param sourceStart Pointer to the first UTF-8 item.
//! \param length      Output pointer receiving the result; must not be NULL.
//! \param flags       strictConversion or lenientConversion.
//! \return A ConversionResult; per the file header sourceIllegal if sourceStart or length is NULL.
00256 ConversionResult CalculateUTF16BufferSizeForUTF8 (const UTF8     *sourceStart,
00257                                                   unsigned int   *length,
00258                                                   ConversionFlags flags);
00260 
00261 
00262 
00263 #ifdef __cplusplus
00264 }
00265 #endif
00266 
00267 /* --------------------------------------------------------------------- */
00268 
00269 #endif // __mlConvertUTF_H
00270 
00271