SDK/MLReference/mlConvertUTF_8h_source.html

00001
00002
00003 /*
00004  * Copyright 2001 Unicode, Inc.
00005  *
00006  * Disclaimer
00007  *
00008  * This source code is provided as is by Unicode, Inc. No claims are
00009  * made as to fitness for any particular purpose. No warranties of any
00010  * kind are expressed or implied. The recipient agrees to determine
00011  * applicability of information provided. If this file has been
00012  * purchased on magnetic or optical media from Unicode, Inc., the
00013  * sole remedy for any claim will be exchange of defective media
00014  * within 90 days of receipt.
00015  *
00016  * Limitations on Rights to Redistribute This Code
00017  *
00018  * Unicode, Inc. hereby grants the right to freely use the information
00019  * supplied in this file in the creation of products supporting the
00020  * Unicode Standard, and to make copies of this file in any form
00021  * for internal or external distribution as long as this notice
00022  * remains attached.
00023  */
00024
00025 /* ---------------------------------------------------------------------
00026
00027     Conversions between UTF-32, UTF-16, and UTF-8.  Header file.
00028
00029     Several functions are included here, forming a complete set of
00030     conversions between the three formats.  UTF-7 is not included
00031     here, but is handled in a separate source file.
00032
00033     Each of these routines takes pointers to input buffers and output
00034     buffers. The input buffers are const.
00035
00036     Each routine converts the text between *sourceStart and sourceEnd,
00037     putting the result into the buffer between *targetStart and
00038     targetEnd. Note: the end pointers are *after* the last item: e.g.,
00039     *(sourceEnd - 1) is the last item.
00040
00041     The return result indicates whether the conversion was successful,
00042     and if not, whether the problem was in the source or target buffers.
00043     (Only the first encountered problem is indicated.)
00044
00045     After the conversion, *sourceStart and *targetStart are both
00046     updated to point to the end of the last text successfully converted
00047     in the respective buffers.
00048
00049     General (parameter) errors:
00050     - If an illegal parameter (i.e. a NULL pointer) is passed, then
00051       "sourceIllegal" is returned when sourceStart, sourceEnd or length
00052       is NULL. \n
00053       If targetStart or targetEnd is NULL, "targetExhausted" is
00054       returned. \n
00055       In those cases none of the parameters passed to the functions
00056       is changed.
00057     - If an exception is caught, "sourceIllegal" or "false" is
00058       returned after sending a fatal error message to the ML error handler.
00059       The state of the target buffers and return pointers is then
00060       undefined.
00061
00062     Input parameters:
00063     sourceStart - pointer to a pointer to the source buffer.
00064     The contents of this are modified on return so that
00065     it points at the next thing to be converted.
00066     targetStart - similarly, pointer to pointer to the target buffer.
00067     sourceEnd, targetEnd - respectively pointers to the ends of the
00068     two buffers, for overflow checking only.
00069
00070     These conversion functions take a ConversionFlags argument. When this
00071     flag is set to strict, both irregular sequences and isolated surrogates
00072     will cause an error.  When the flag is set to lenient, both irregular
00073     sequences and isolated surrogates are converted.
00074
00075     Whether the flag is strict or lenient, all illegal sequences will cause
00076     an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
00077     or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant
00078     code must check for illegal sequences.
00079
00080     When the flag is set to lenient, characters over 0x10FFFF are converted
00081     to the replacement character; otherwise (when the flag is set to strict)
00082     they constitute an error.
00083
00084     Output parameters:
00085     The value "sourceIllegal" is returned from some routines if the input
00086     sequence is malformed.  When "sourceIllegal" is returned, the source
00087     value will point to the illegal value that caused the problem. E.g.,
00088     in UTF-8 when a sequence is malformed, it points to the start of the
00089     malformed sequence.
00090
00091     Author: Mark E. Davis, 1994.
00092     Rev History: Rick McGowan, fixes & updates May 2001.
00093     Fixes & updates, Sept 2001.
00094
00095 ------------------------------------------------------------------------ */
00096
00097 //--------------------------------------------------------------------
00113 //--------------------------------------------------------------------
00114
00115 #ifndef __mlConvertUTF_H
00116 #define __mlConvertUTF_H
00117
00118 #ifndef __mlTypeDefs_H
00119 #include "mlTypeDefs.h"
00120 #endif
00121
00122 //--------------------------------------------------------------------
00128 //--------------------------------------------------------------------
00129 typedef MLuint32 UTF32;  //!< Element type of UTF-32 buffers (alias of MLuint32).
00130 typedef MLuint16 UTF16;  //!< Element type of UTF-16 buffers (alias of MLuint16).
00131 typedef MLuint8  UTF8;   //!< Element type of UTF-8 buffers (alias of MLuint8).
00132
00133
00134 //--------------------------------------------------------------------
00136 //--------------------------------------------------------------------
//! Named UTF32 constants used by the conversion routines.
//! NOTE(review): the expansions use static_cast, so these macros can only be used from C++
//! code even though the prototypes below are wrapped in extern "C" - confirm whether C use is intended.
00137 #define UNI_REPLACEMENT_CHAR (static_cast<UTF32>(0x0000FFFD))  //!< Used instead of invalid characters on lenient conversion.
00138 #define UNI_MAX_BMP          (static_cast<UTF32>(0x0000FFFF))  //!< 0xFFFF; by its name, the highest value of the Basic Multilingual Plane.
00139 #define UNI_MAX_UTF16        (static_cast<UTF32>(0x0010FFFF))  //!< 0x10FFFF; UTF-32 values above this are illegal (replaced on lenient conversion).
00140 #define UNI_MAX_UTF32        (static_cast<UTF32>(0x7FFFFFFF))  //!< 0x7FFFFFFF. NOTE(review): presumably the largest value treated as a UTF-32 unit - confirm against the implementation.
00141
00142
00143 //--------------------------------------------------------------------
00145 //--------------------------------------------------------------------
//! Result code returned by the conversion routines; only the first problem encountered is reported.
00146 typedef enum {
00147   conversionOK,     //!< The conversion was successful.
00148   sourceExhausted,  //!< Problem in the source buffer. NOTE(review): presumably the input ended inside a multi-unit sequence - confirm against the implementation.
00149   targetExhausted,  //!< Problem in the target buffer; also returned when targetStart or targetEnd is NULL.
00150   sourceIllegal     //!< The input sequence is malformed; also returned for a NULL sourceStart, sourceEnd or length, and possibly after a caught exception.
00151 } ConversionResult;
00152
00153 //--------------------------------------------------------------------
00155 //--------------------------------------------------------------------
//! Selects how the conversion routines treat irregular input. Illegal sequences are an error in both modes.
00156 typedef enum {
00157   strictConversion = 0,  //!< Irregular sequences, isolated surrogates and characters over 0x10FFFF cause an error.
00158   lenientConversion      //!< Irregular sequences and isolated surrogates are converted; characters over 0x10FFFF become the replacement character.
00159 } ConversionFlags;
00160
00161 //--------------------------------------------------------------------
00162 /* This is for C++ and does no harm in C */
00163 //--------------------------------------------------------------------
00164 #ifdef __cplusplus
00165 extern "C" {
00166 #endif
00167
00168 //---------------------------------------
00170
00171 //---------------------------------------
00172
//! Converts the UTF-32 text between *sourceStart and sourceEnd to UTF-16, writing it into the
//! buffer between *targetStart and targetEnd. Both end pointers point one past the last item.
//! \param sourceStart In/out: on return points at the next item to be converted.
//! \param sourceEnd   End of the source buffer.
//! \param targetStart In/out: on return points past the last converted output.
//! \param targetEnd   End of the target buffer, used for overflow checking only.
//! \param flags       Strict or lenient handling of irregular sequences and isolated surrogates.
//! \return Whether the conversion succeeded and, if not, whether the problem was in the source
//!         or the target buffer (NULL source pointers give sourceIllegal, NULL target pointers targetExhausted).
00176 ConversionResult ConvertUTF32toUTF16 (const UTF32   **sourceStart,
00177                                       const UTF32    *sourceEnd,
00178                                       UTF16         **targetStart,
00179                                       UTF16          *targetEnd,
00180                                       ConversionFlags flags);
00181
//! Converts the UTF-16 text between *sourceStart and sourceEnd to UTF-32, writing it into the
//! buffer between *targetStart and targetEnd. Both end pointers point one past the last item.
//! \param sourceStart In/out: on return points at the next item to be converted.
//! \param sourceEnd   End of the source buffer.
//! \param targetStart In/out: on return points past the last converted output.
//! \param targetEnd   End of the target buffer, used for overflow checking only.
//! \param flags       Strict or lenient handling of irregular sequences and isolated surrogates.
//! \return Whether the conversion succeeded and, if not, whether the problem was in the source
//!         or the target buffer (NULL source pointers give sourceIllegal, NULL target pointers targetExhausted).
00185 ConversionResult ConvertUTF16toUTF32 (const UTF16   **sourceStart,
00186                                       const UTF16    *sourceEnd,
00187                                       UTF32         **targetStart,
00188                                       UTF32          *targetEnd,
00189                                       ConversionFlags flags);
00190
//! Converts the UTF-16 text between *sourceStart and sourceEnd to UTF-8, writing it into the
//! buffer between *targetStart and targetEnd. Both end pointers point one past the last item.
//! \param sourceStart In/out: on return points at the next item to be converted.
//! \param sourceEnd   End of the source buffer.
//! \param targetStart In/out: on return points past the last converted output.
//! \param targetEnd   End of the target buffer, used for overflow checking only.
//! \param flags       Strict or lenient handling of irregular sequences and isolated surrogates.
//! \return Whether the conversion succeeded and, if not, whether the problem was in the source
//!         or the target buffer (NULL source pointers give sourceIllegal, NULL target pointers targetExhausted).
00194 ConversionResult ConvertUTF16toUTF8 ( const UTF16   **sourceStart,
00195                                       const UTF16    *sourceEnd,
00196                                       UTF8          **targetStart,
00197                                       UTF8           *targetEnd,
00198                                       ConversionFlags flags);
00199
//! Converts the UTF-8 text between *sourceStart and sourceEnd to UTF-16, writing it into the
//! buffer between *targetStart and targetEnd. Both end pointers point one past the last item.
//! \param sourceStart In/out: on return points at the next item to be converted; when the input
//!                    is malformed it points to the start of the malformed sequence.
//! \param sourceEnd   End of the source buffer.
//! \param targetStart In/out: on return points past the last converted output.
//! \param targetEnd   End of the target buffer, used for overflow checking only.
//! \param flags       Strict or lenient handling of irregular sequences and isolated surrogates.
//! \return Whether the conversion succeeded and, if not, whether the problem was in the source
//!         or the target buffer (NULL source pointers give sourceIllegal, NULL target pointers targetExhausted).
00203 ConversionResult ConvertUTF8toUTF16 ( const UTF8    **sourceStart,
00204                                       const UTF8     *sourceEnd,
00205                                       UTF16         **targetStart,
00206                                       UTF16          *targetEnd,
00207                                       ConversionFlags flags);
00208
//! Converts the UTF-32 text between *sourceStart and sourceEnd to UTF-8, writing it into the
//! buffer between *targetStart and targetEnd. Both end pointers point one past the last item.
//! \param sourceStart In/out: on return points at the next item to be converted.
//! \param sourceEnd   End of the source buffer.
//! \param targetStart In/out: on return points past the last converted output.
//! \param targetEnd   End of the target buffer, used for overflow checking only.
//! \param flags       Strict or lenient handling of irregular sequences and isolated surrogates.
//! \return Whether the conversion succeeded and, if not, whether the problem was in the source
//!         or the target buffer (NULL source pointers give sourceIllegal, NULL target pointers targetExhausted).
00212 ConversionResult ConvertUTF32toUTF8  (const UTF32   **sourceStart,
00213                                       const UTF32    *sourceEnd,
00214                                       UTF8          **targetStart,
00215                                       UTF8           *targetEnd,
00216                                       ConversionFlags flags);
00217
//! Converts the UTF-8 text between *sourceStart and sourceEnd to UTF-32, writing it into the
//! buffer between *targetStart and targetEnd. Both end pointers point one past the last item.
//! Unlike the other conversion routines declared here, this one takes no ConversionFlags argument.
//! NOTE(review): whether it behaves as strict or lenient is not visible in this header - confirm.
//! \param sourceStart In/out: on return points at the next item to be converted; when the input
//!                    is malformed it points to the start of the malformed sequence.
//! \param sourceEnd   End of the source buffer.
//! \param targetStart In/out: on return points past the last converted output.
//! \param targetEnd   End of the target buffer, used for overflow checking only.
//! \return Whether the conversion succeeded and, if not, whether the problem was in the source
//!         or the target buffer (NULL source pointers give sourceIllegal, NULL target pointers targetExhausted).
00221 ConversionResult ConvertUTF8toUTF32 ( const UTF8    **sourceStart,
00222                                       const UTF8     *sourceEnd,
00223                                       UTF32         **targetStart,
00224                                       UTF32          *targetEnd);
00225
00226
00227
00228 //---------------------------------------
00230
00231 //---------------------------------------
//! By its name, converts UTF-8 text to Latin-1, writing into the char buffer between targetStart
//! and targetEnd. The pointers are passed by value, so the caller's pointers are not advanced.
//! NOTE(review): no sourceEnd is passed, so the input is presumably NUL-terminated - confirm.
//! \param sourceStart Start of the UTF-8 input; NULL gives sourceIllegal.
//! \param targetStart Start of the output buffer; NULL gives targetExhausted.
//! \param targetEnd   End of the output buffer; NULL gives targetExhausted.
//! \param flags       Strict or lenient conversion.
//! \return Whether the conversion succeeded and, if not, whether the problem was in the source or the target buffer.
00236 ConversionResult ConvertUTF8toLatin1 (const UTF8     *sourceStart,
00237                                       char           *targetStart,
00238                                       char           *targetEnd,
00239                                       ConversionFlags flags);
00240
//! By its name, counts the characters in the UTF-8 text starting at sourceStart.
//! NOTE(review): the count is presumably stored through length, and the input is presumably
//! NUL-terminated since no end pointer is passed - confirm against the implementation.
//! \param sourceStart Start of the UTF-8 input; NULL gives sourceIllegal.
//! \param length      Non-const pointer, presumably receiving the result; NULL gives sourceIllegal.
//! \param flags       Strict or lenient handling of the input.
//! \return A ConversionResult; sourceIllegal is the value reported for malformed input.
00246 ConversionResult CalculateNumCharsInUTF8 (const UTF8     *sourceStart,
00247                                           unsigned int   *length,
00248                                           ConversionFlags flags);
00249
//! By its name, calculates the UTF-16 buffer size needed for the UTF-8 text starting at sourceStart.
//! NOTE(review): the result is presumably stored through length; its unit (UTF16 items or bytes)
//! and whether it includes a terminator are not visible in this header - confirm.
//! \param sourceStart Start of the UTF-8 input; NULL gives sourceIllegal.
//! \param length      Non-const pointer, presumably receiving the result; NULL gives sourceIllegal.
//! \param flags       Strict or lenient handling of the input.
//! \return A ConversionResult; sourceIllegal is the value reported for malformed input.
00256 ConversionResult CalculateUTF16BufferSizeForUTF8 (const UTF8     *sourceStart,
00257                                                   unsigned int   *length,
00258                                                   ConversionFlags flags);
00260
00261
00262
00263 #ifdef __cplusplus
00264 }
00265 #endif
00266
00267 /* --------------------------------------------------------------------- */
00268
00269 #endif // __mlConvertUTF_H
00270
00271