mi32/ucstring.h

Go to the documentation of this file.
00001 /**
00002  * \file mi32/ucstring.h
00003  * \brief Definitions for Unicode string functions and related text conversion
00004  *
00005  * \if NODOC
00006  * $Id: ucstring.h_v 1.126 2003/10/06 17:05:12 mju Exp $
00007  *
00008  * $Log: ucstring.h_v $
00009  * Revision 1.126  2003/10/06 17:05:12  mju
00010  * Remove obsolete prototype.
00011  *
00012  * Revision 1.125  2003/10/03 20:05:23  linux32build!build
00013  * Doxygen
00014  *
00015  * Revision 1.124  2003/09/15 16:33:00  scowan
00016  * removed mac native code.
00017  *
00018  * Revision 1.123  2003/09/15 13:49:56  fileserver!dwilliss
00019  * Doxygen
00020  *
00021  * Revision 1.122  2003/07/09 14:27:09  mju
00022  * Don't define MISYSTEMclassexport at all as not needed.
00023  * Move libexport stuff to after all headers.
00024  *
00025  * Revision 1.121  2003/07/09 14:04:57  dwilliss
00026  * Don't leave LIBEXPORT defined after the end of the file
00027  *
00028  * Revision 1.120  2003/06/30 16:03:29  mju
00029  * Remove mttext fn to get from textid.
00030  *
00031  * Revision 1.119  2003/02/07 15:39:53  dwilliss
00032  * Added a prototype for a new function
00033  *
00034  * Revision 1.118  2002/09/11 17:01:23  dwilliss
00035  * Added KOI8 encoding
00036  *
00037  * Revision 1.117  2002/09/09 16:35:42  mju
00038  * Remove varargs fns.
00039  * \endif
00040 **/
00041 
00042 #ifndef INC_MI32_UCSTRING_H
00043 #define INC_MI32_UCSTRING_H
00044 
00045 
00046 #ifndef UCSTRING_H
00047 #define UCSTRING_H
00048 
00049 #ifndef INC_MI32_STDDEFNS_H
00050 #include <mi32/stddefns.h>
00051 #endif
00052 
00053 #ifndef INC_MI32_TEXTID_H
00054 #include <mi32/textid.h>
00055 #endif
00056 
00057 #include <stdio.h>
00058 
00059 #if defined(SUN) && defined(BSD)
00060 
00061 #include <varargs.h>
00062 #include <sys/stdtypes.h>
00063 
00064 #else
00065 
00066 #include <stdarg.h>
00067 
00068 #endif
00069 
00070 #ifdef MISYSTEMDLL
00071    #define LIBEXPORT MI_DLLEXPORT
00072 #else
00073    #define LIBEXPORT MI_DLLIMPORT
00074 #endif
00075 
00076 //! State carried between calls to MucKeyEventToUnicode(), which takes
00077 //! a pointer to this structure (see its prototype in this file).
00078 struct MUCEVENTSTATE {
00079    int  compose_mode;
00080    int  initialized;
00081    int  compose_byte1;
00082    int  compose_len;
00083    FILE  *debug;                  //!< stdio stream; presumably for debug output (TODO confirm)
00084    int  KeyboardCharsetLower128;  //!< presumably a CHARSET_* value (TODO confirm)
00085    int  KeyboardCharsetUpper128;  //!< presumably a CHARSET_* value (TODO confirm)
00086    int  KeyboardCharset2Byte;     //!< presumably a CHARSET_* value (TODO confirm)
00087    char compose_status[40];
00088    UINT8 compose_temp[80];
00089    };
00090 
00091 #if defined(__cplusplus)
00092 extern "C" {
00093 #endif
00094 #ifdef _XLIB_H_
00095 
00096 //! Convert XKeyEvents to UNICODE.
00097 //! Only declared when _XLIB_H_ is defined (presumably by including the Xlib header first).
00098 //! Nothing uses this. It may not even work currently.
00099 int MucKeyEventToUnicode (XEvent*, MUCEVENTSTATE*, UINT32);
00100 #endif
00101 #if defined(__cplusplus)
00102 }
00103 #endif
00104 
00105 #define MUCVERSION   2
00106 #ifndef GENERATING_DOXYGEN_OUTPUT
00107 /* Structure of the header of unicode2.ref.  All numbers are stored
00108  * in LoHi (low byte first) order in unicode2.ref.  HiLo machines
00109  * will have to byte-swap each field after reading it.
00110  */
00111 struct MUCFILEHDR {
00112    UINT32 FileHdrSize;     //!<  Size of this structure
00113    UINT32 EncHdrSize;         //!<  sizeof(MUCENCODEHDR)
00114    UINT32 Version;            //!<  MUCVERSION
00115    UINT32 NumEncodings;    //!<  Number of MUCENCODEHDRs that follow
00116    };
00117 
00118 struct MUCENCODEHDR {
00119    UNICODE name[16];       //!<  Name of the encoding (16 UNICODE units)
00120    INT32 offset;           //!<  Offset into file of data for this encoding 
00121    INT32 numpairs;         //!<  Presumably the number of mapping pairs stored at offset (TODO confirm)
00122    };
00123 #endif // GENERATING_DOXYGEN_OUTPUT
00124 
00125 /*
00126 ** Constants for MucCharsetToUnicode()/MucUnicodeToCharset()
00127 */
00128 #define MASK_ALG                 0x7FFF0000
00129 #define MASK_CHARSET             0x0000FFFF
00130 
00131 #define CHARALG_SJIS             0x00010000
00132 #define CHARALG_EUC              0x00020000
00133 #define CHARALG_GL2GR            0x00040000  //!< Set high bits
00134 #define CHARALG_GR2GL            0x00080000  //!< Strip high bits
00135 
00136 #define CHARSET_Unicode          0x00000000
00137 #define CHARSET_GB_2312          0x00000001
00138 #define CHARSET_GB_12345         0x00000002
00139 #define CHARSET_GB_7589          0x00000003
00140 #define CHARSET_GB_7590          0x00000004
00141 #define CHARSET_GB_Han           0x00000005
00142 #define CHARSET_GB_8565          0x00000006
00143 #define CHARSET_BigFive          0x00000007
00144 #define CHARSET_CNS_11543_01     0x00000008
00145 #define CHARSET_CNS_11543_02     0x00000009
00146 #define CHARSET_CNS_11543_14     0x0000000A
00147 #define CHARSET_JIS_X_0208_1990  0x0000000B
00148 #define CHARSET_JIS_X_0212_1990  0x0000000C
00149 #define CHARSET_KS_C_5601_1987   0x0000000D  //!< Wansung
00150 #define CHARSET_KS_C_5657_1991   0x0000000E
00151 #define CHARSET_ISO_8859_1       0x0000000F      //!<  ASCII 
00152 #define CHARSET_ISO_8859_2       0x00000010     
00153 #define CHARSET_ISO_8859_3       0x00000011    
00154 #define CHARSET_ISO_8859_4       0x00000012  
00155 #define CHARSET_ISO_8859_5       0x00000013
00156 #define CHARSET_ISO_8859_6       0x00000014
00157 #define CHARSET_ISO_8859_7       0x00000015
00158 #define CHARSET_ISO_8859_8       0x00000016
00159 #define CHARSET_ISO_8859_9       0x00000017
00160 #define CHARSET_JIS_ROMAN        0x00000018
00161 #define CHARSET_GB_ROMAN         0x00000019  //!<  From ISO 2022 
00162 #define CHARSET_JIS_C_6226       0x0000001A  //!<  From ISO 2022 
00163 #define CHARSET_HW_KATAKANA      0x0000001B  //!<  From ISO 2022 
00164 #define CHARSET_WinCP_Turk       0x0000001C  //!<  Windows Codepage (cp1254)
00165 #define CHARSET_WinCP_EE         0x0000001D  //!<  Windows Codepage (cp1250)
00166 #define CHARSET_WinCP_Cyrl       0x0000001E  //!<  Windows Codepage (cp1251)
00167 #define CHARSET_WinCP_Greek      0x0000001F  //!<  Windows Codepage (cp1253)
00168 #define CHARSET_WinCP_Arab       0x00000020  //!<  Windows Codepage (cp1256)
00169 #define CHARSET_WinCP_Hebr       0x00000021  //!<  Windows Codepage (cp1255)
00170 #define CHARSET_DOSCP_437        0x00000022  //!<  DOS Codepage 
00171 #define CHARSET_DOSCP_850        0x00000023  //!<  DOS Codepage 
00172 #define CHARSET_DOSCP_852        0x00000024  //!<  DOS Codepage 
00173 #define CHARSET_DOSCP_857        0x00000025  //!<  DOS Codepage 
00174 #define CHARSET_DOSCP_861        0x00000026  //!<  DOS Codepage 
00175 #define CHARSET_DOSCP_863        0x00000027  //!<  DOS Codepage 
00176 #define CHARSET_DOSCP_865        0x00000028  //!<  DOS Codepage 
00177 #define CHARSET_IBMCP_869        0x00000029  //!<  IBM Codepage 
00178 #define CHARSET_IBMCP_855        0x0000002A  //!<  IBM Codepage 
00179 #define CHARSET_IBMCP_864        0x0000002B  //!<  IBM Codepage 
00180 #define CHARSET_IBMCP_1040       0x0000002C  //!<  IBM Codepage 
00181 #define CHARSET_IBMCP_1041       0x0000002D  //!<  IBM Codepage 
00182 #define CHARSET_IBMCP_1043       0x0000002E  //!<  IBM Codepage 
00183 #define CHARSET_TIS620_2529      0x0000002F  //!<  Thai (BDF font order) 
00184 #define CHARSET_WinCP_Thai       0x00000030  //!<  Thai Windows ordering 
00185 #define CHARSET_MI_Thai          0x00000031  //!<  Thai Windows ordering 
00186 #define CHARSET_WinCP_ANSI       0x00000032  //!<  Windows ANSI Code Page 
00187 #define CHARSET_ArabicUC         0x00000033  //!<  The Arabic range of Unicode 
00188 #define CHARSET_MacArabic        0x00000034
00189 #define CHARSET_MacCroatian      0x00000035
00190 #define CHARSET_MacCyrillic      0x00000036
00191 #define CHARSET_MacDingbats      0x00000037
00192 #define CHARSET_MacCentEurope    0x00000038
00193 #define CHARSET_MacGreek         0x00000039
00194 #define CHARSET_MacHebrew        0x0000003A
00195 #define CHARSET_MacIcelandic     0x0000003B
00196 #define CHARSET_MacJapanese      0x0000003C
00197 #define CHARSET_MacRomanian      0x0000003D
00198 #define CHARSET_MacRoman         0x0000003E
00199 #define CHARSET_MacSymbol        0x0000003F
00200 #define CHARSET_MacThai          0x00000040
00201 #define CHARSET_MacTurkish       0x00000041
00202 #define CHARSET_MacUkrainian     0x00000042
00203 #define CHARSET_WinCP_Baltic     0x00000043
00204 #define CHARSET_IBM_1046         0x00000044  //!<  IBM Arabic encoding 
00205 #define CHARSET_KS_C_5601_Unif   0x00000045  //!< Unified Hangul (UHang)
00206 #define CHARSET_KS_C_5601_1992   0x00000046  //!< Johab
00207 #define CHARSET_ISCII_Devanagari 0x00000047  //!< Indic
00208 #define CHARSET_ISCII_Bengali    0x00000048  //!< Indic
00209 #define CHARSET_ISCII_Gurmukhi   0x00000049  //!< Indic
00210 #define CHARSET_ISCII_Gujarati   0x0000004A  //!< Indic
00211 #define CHARSET_ISCII_Oriya      0x0000004B  //!< Indic
00212 #define CHARSET_ISCII_Tamil      0x0000004C  //!< Indic
00213 #define CHARSET_ISCII_Telugu     0x0000004D  //!< Indic
00214 #define CHARSET_ISCII_Kannada    0x0000004E  //!< Indic
00215 #define CHARSET_ISCII_Malayalam  0x0000004F  //!< Indic
00216 #define CHARSET_KOI8             0x00000050  //!< Russian/Ukrainian/Etc.
00217 #define CHARSET_MAX              0x00000050
00218 
00219 /* Some aliases... */
00220 #define CHARSET_ISO_Latin_1      CHARSET_ISO_8859_1   
00221 #define CHARSET_ASCII            CHARSET_ISO_Latin_1
00222 #define CHARSET_JIS_0208         CHARSET_JIS_X_0208_1990
00223 #define CHARSET_JIS_0212         CHARSET_JIS_X_0212_1990
00224 #define CHARSET_SHIFT_JIS        (CHARALG_SJIS | CHARSET_JIS_X_0208_1990)
00225 
00226 #define UCCONV_NoByteOrderMark   0x00000001
00227 
00228 //!
00229 //! A character encoding exactly specifies both the set(s) of characters used
00230 //! and how they are stored in memory (assuming native byte order).  Note that
00231 //! this is *not* the same as a CHARSET.  Once an encoding value is assigned
00232 //! it must never be changed, as the value may be stored in a data file and is
00233 //! also passed to the misystem DLL.
00234 //!
00235 //! These values should NOT be considered sequential!
00236 
00237 #ifndef __cplusplus
00238 typedef
00239 #endif
00240 enum CHAR_ENCODING {
00241    //! Base encodings
00242    CHAR_ENCODING_ASCII = 0,            //!< ASCII
00243    CHAR_ENCODING_Unicode,              //!< 2 byte encoding
00244    CHAR_ENCODING_UTF8,
00245    CHAR_ENCODING_XResFile,             //!< Variation on ISO-2022
00246    CHAR_ENCODING_Unicode_Decomposed,   //!< 2 byte encoding, but things like O with umlaut are converted to 2 chars: an O and an umlaut
00247    CHAR_ENCODING_UTF8_Decomposed,      //!< Unicode_Decomposed converted to UTF8
00248    CHAR_ENCODING_WindowsMultiByte,     //!< WINDOWS ONLY - Uses MultiByteToWideChar/WideCharToMultiByte
00249 
00250    //! ISO Encodings
00251    CHAR_ENCODING_ISO_2022 = 100,
00252    CHAR_ENCODING_ISO_8859_1,
00253    CHAR_ENCODING_ISO_8859_2,
00254    CHAR_ENCODING_ISO_8859_3,
00255    CHAR_ENCODING_ISO_8859_4,
00256    CHAR_ENCODING_ISO_8859_5,
00257    CHAR_ENCODING_ISO_8859_6,
00258    CHAR_ENCODING_ISO_8859_7,
00259    CHAR_ENCODING_ISO_8859_8,
00260    CHAR_ENCODING_ISO_8859_9,
00261 
00262    //! Japanese encodings
00263    CHAR_ENCODING_JIS = 120,      //!< Japanese
00264    CHAR_ENCODING_EUC,            //!< Japanese
00265    CHAR_ENCODING_SJIS,           //!< Japanese
00266    CHAR_ENCODING_Japanese,       //!< Will guess which of the 3 Japanese encodings
00267 
00268    //! Chinese encodings
00269    CHAR_ENCODING_Big5 = 130,     //!< Chinese
00270    CHAR_ENCODING_GB_2312,        //!< Chinese
00271 
00272    //! Korean encodings
00273    CHAR_ENCODING_KCS_5601 = 140,       //!< Korean
00274 
00275    //! ISCII (Indic) Encodings
00276    CHAR_ENCODING_ISCII_Devanagari = 150,
00277    CHAR_ENCODING_ISCII_Bengali,
00278    CHAR_ENCODING_ISCII_Gurmukhi,
00279    CHAR_ENCODING_ISCII_Gujarati,
00280    CHAR_ENCODING_ISCII_Oriya,
00281    CHAR_ENCODING_ISCII_Tamil,
00282    CHAR_ENCODING_ISCII_Telugu,
00283    CHAR_ENCODING_ISCII_Kannada,
00284    CHAR_ENCODING_ISCII_Malayalam,
00285    
00286    //! Windows Code Pages
00287    CHAR_ENCODING_WinCP_ANSI = 300,     //!< Most English and Western Europe
00288    CHAR_ENCODING_WinCP_Cyrl,           //!< Russian
00289    CHAR_ENCODING_WinCP_Greek,
00290    CHAR_ENCODING_WinCP_Arabic,
00291    CHAR_ENCODING_WinCP_Thai,
00292    CHAR_ENCODING_WinCP_EE,
00293    CHAR_ENCODING_WinCP_Turk,
00294    CHAR_ENCODING_WinCP_Hebr,
00295 
00296    //! IBM Code Pages
00297    CHAR_ENCODING_IBMCP_855 = 400,
00298    CHAR_ENCODING_IBMCP_869,
00299 
00300    //! DOS Code Pages
00301    CHAR_ENCODING_DOSCP_437 = 500,
00302    CHAR_ENCODING_DOSCP_850,
00303    CHAR_ENCODING_DOSCP_852,
00304    CHAR_ENCODING_DOSCP_857,
00305    CHAR_ENCODING_DOSCP_861,
00306    CHAR_ENCODING_DOSCP_863,
00307    CHAR_ENCODING_DOSCP_865,
00308 
00309    //! Macintosh Encodings
00310    CHAR_ENCODING_MacArabic = 600,
00311    CHAR_ENCODING_MacCroatian,
00312    CHAR_ENCODING_MacCyrillic,
00313    CHAR_ENCODING_MacDingbats,
00314    CHAR_ENCODING_MacCentEurope,
00315    CHAR_ENCODING_MacGreek,
00316    CHAR_ENCODING_MacHebrew,
00317    CHAR_ENCODING_MacIcelandic,
00318    CHAR_ENCODING_MacJapanese,
00319    CHAR_ENCODING_MacRomanian,
00320    CHAR_ENCODING_MacRoman,
00321    CHAR_ENCODING_MacSymbol,
00322    CHAR_ENCODING_MacThai,
00323    CHAR_ENCODING_MacTurkish,
00324    CHAR_ENCODING_MacUkrainian,
00325 
00326    //! Misc Code Pages
00327    CHAR_ENCODING_KOI8 = 700
00328 
00329    }
00330 #ifndef __cplusplus
00331    CHAR_ENCODING
00332 #endif
00333    ;
00334 
00335 //! Bit flags which can be used when getting the list of encodings
00336 #ifndef __cplusplus
00337 typedef
00338 #endif
00339 enum ENCODELIST_FLAGS {
00340    ENCODELIST_FLAG_Any        = 0x0000,      //!< No flag bits set
00341    ENCODELIST_FLAG_Importable = 0x0001,
00342    ENCODELIST_FLAG_Exportable = 0x0002,
00343    ENCODELIST_FLAG_Both       = 0x0003,      //!< Importable | Exportable
00344    ENCODELIST_FLAG_NoUnicode  = 0x0004
00345    }
00346 #ifndef __cplusplus
00347    ENCODELIST_FLAGS
00348 #endif
00349    ;
00350 
00351 #ifdef __cplusplus
00352 //! Following only defined for C++
00353 //! Don't add a semicolon to the end of this!
00354 DEFINE_ENUM_OPERATORS(ENCODELIST_FLAGS)
00355 #endif
00356 
00357 #ifndef __cplusplus
00358 typedef
00359 #endif
00360 //! Script tags
00361 //! These are used mostly for when multiple "scripts" (as in writing systems)
00362 //! use the same character set but have different rules.  A good example of 
00363 //! this is the Indic languages which mostly share the ISCII character set
00364 //! but Devanagari and Bengali have different formatting rules.
00365 //! The numbers below are not random gibberish as it may seem.  If you take
00366 //! each pair of hex digits, they form an ASCII value so that together they
00367 //! make up a 4 letter abbreviation.  This is how these tags are encoded in
00368 //! TrueType fonts.  I would have #defined them as 'arab' instead of a big
00369 //! number if I thought all compilers that we support would accept that.
00370 enum SCRIPTTAG {
00371 
00372     SCRIPTTAG_arab = 0x61726162, //!< 'arab' *** Arabic
00373     SCRIPTTAG_armn = 0x61726D6E, //!< 'armn' *** Armenian: no MS definition ***
00374     SCRIPTTAG_beng = 0x62656E67, //!< 'beng' *** Bengali (Indic)
00375     SCRIPTTAG_bpmf = 0x62706D66, //!< 'bpmf' *** Bopomofo: no MS definition ***
00376     SCRIPTTAG_cyrl = 0x6379726C, //!< 'cyrl' *** Cyrillic
00377     SCRIPTTAG_deva = 0x64657661, //!< 'deva' *** Devanagari (Indic)
00378     SCRIPTTAG_grek = 0x6772656B, //!< 'grek' *** Greek
00379     SCRIPTTAG_grgn = 0x6772676E, //!< 'grgn' *** Georgian: no MS definition ***
00380     SCRIPTTAG_gujr = 0x67756A72, //!< 'gujr' *** Gujarati (Indic)
00381     SCRIPTTAG_hang = 0x68616E67, //!< 'hang' *** Hangul
00382     SCRIPTTAG_hani = 0x68616E69, //!< 'hani' *** CJK ideographs (Han)
00383     SCRIPTTAG_hebr = 0x68656272, //!< 'hebr' *** Hebrew
00384     SCRIPTTAG_kana = 0x6B616E61, //!< 'kana' *** Hiragana / Katakana
00385     SCRIPTTAG_knbn = 0x6B6E626E, //!< 'knbn' *** Kanbun: no MS definition ***
00386     SCRIPTTAG_knda = 0x6B6E6461, //!< 'knda' *** Kannada (Indic)
00387     SCRIPTTAG_laoS = 0x6C616F20, //!< 'lao ' *** Lao: no MS definition ***
00388     SCRIPTTAG_latn = 0x6C61746E, //!< 'latn' *** Latin
00389     SCRIPTTAG_mlym = 0x6D6C796D, //!< 'mlym' *** Malayalam (Indic)
00390     SCRIPTTAG_orya = 0x6F727961, //!< 'orya' *** Oriya (Indic)
00391     SCRIPTTAG_punj = 0x70756E6A, //!< 'punj' *** punjabi == gurmukhi
00392     SCRIPTTAG_taml = 0x74616D6C, //!< 'taml' *** Tamil (Indic)
00393     SCRIPTTAG_telu = 0x74656C75, //!< 'telu' *** Telugu (Indic)
00394     SCRIPTTAG_thai = 0x74686169, //!< 'thai' *** Thai
00395     SCRIPTTAG_tibt = 0x74696274, //!< 'tibt' *** Tibetan (was 0x74696174, which spells 'tiat')
00396       
00397     SCRIPTTAG_neut = 0x4E455554, //!< 'NEUT'
00398     SCRIPTTAG_puse = 0x50555345, //!< 'PUSE'
00399     SCRIPTTAG_spcl = 0x5350434C, //!< 'SPCL'
00400     SCRIPTTAG_surr = 0x53555252, //!< 'SURR'
00401 
00402     SCRIPTTAG_Default = 0x00000000 //!< ''
00403    }
00404 #ifndef __cplusplus
00405    SCRIPTTAG
00406 #endif
00407    ;
00408 
00409 
00410 
00411 //!
00412 //!   Some special glyph flags.  The method we use to generate GLYPHCONTEXTBITs
00413 //!   requires that PartOfRtoLWord be 1 and that we don't use bits 0x02 and 0x04
00414 //! here.  The PartOfRtoLWord flag will be shifted and OR'd with the bits of 
00415 //! the glyphs on either side of it to create GLYPHCONTEXT bits.  
00416 //! Yes, they're bytes, not longs.  Don't muck with `em!
00417 //!
00418 #define MucGLYPHFLAG_PartOfRtoLWord    0x01
00419 #define MucGLYPHFLAG_NonJoining        0x80
00420 #define MucGLYPHFLAG_Mark              0x40  
00421 
00422 #define MucGLYPHDIRECTION_LtoR         0
00423 #define MucGLYPHDIRECTION_RtoL         1
00424 #define MucGLYPHDIRECTION_Weak         2
00425 
00426 
00427 //! These are the general category codes from column 2 of the
00428 //! UnicodeData.txt.  The numbers are arbitrarily assigned by
00429 //! me.  UnicodeData.txt has the 2-letter codes.
00430 //! You may be asking why I defined the values for all the
00431 //! enums when the compiler would do that for me...See, this
00432 //! way it would be a pain to insert something into the middle
00433 //! and renumber them, wouldn't it?  Good!  Don't do that.
00434 //! ucdata.ref has the numbers and if you renumber them, you'll
00435 //! break things.
00436 enum UCCAT {
00437    UCCAT_Cn = 0,        //!< Other , Not assigned (norm)
00438    UCCAT_Lu = 1,        //!< Letter, Uppercase (norm)
00439    UCCAT_Ll = 2,        //!< Letter, Lowercase (norm)
00440    UCCAT_Lt = 3,        //!< Letter, Titlecase (norm)
00441    UCCAT_Lm = 4,        //!< Letter, Modifier
00442    UCCAT_Lo = 5,        //!< Letter, Other
00443    UCCAT_Mn = 6,        //!< Mark, non-spacing (norm)
00444    UCCAT_Mc = 7,        //!< Mark, spacing combining (norm)
00445    UCCAT_Me = 8,        //!< Mark, enclosing (norm)
00446    UCCAT_Nd = 9,        //!< Number Decimal digit (norm)
00447    UCCAT_Nl = 10,       //!< Number Letter (norm)
00448    UCCAT_No = 11,       //!< Number Other (norm)
00449    UCCAT_Zs = 12,       //!< Separator, space (norm)
00450    UCCAT_Zl = 13,       //!< Separator, line (norm)
00451    UCCAT_Zp = 14,       //!< Separator, paragraph (norm)
00452    UCCAT_Cc = 15,       //!< Other , control (norm)
00453    UCCAT_Cf = 16,       //!< Other , format (norm)
00454    UCCAT_Cs = 17,       //!< Other , surrogate (norm)
00455    UCCAT_Co = 18,       //!< Other , Private use (norm)
00456    UCCAT_Pc = 20,       //!< Punctuation, connector (note: the value 19 is not used)
00457    UCCAT_Pd = 21,       //!< Punctuation, Dash
00458    UCCAT_Ps = 22,       //!< Punctuation, Open
00459    UCCAT_Pe = 23,       //!< Punctuation, Close
00460    UCCAT_Pi = 24,       //!< Punctuation, Initial quote
00461    UCCAT_Pf = 25,       //!< Punctuation, final quote
00462    UCCAT_Po = 26,       //!< Punctuation, other
00463    UCCAT_Sm = 27,       //!< Symbol, math
00464    UCCAT_Sc = 28,       //!< Symbol, currency
00465    UCCAT_Sk = 29,       //!< Symbol, modifier
00466    UCCAT_So = 30        //!< Symbol, other
00467    };
00468 
00469 //! Flags for the above which can be ORd together (bit N corresponds to UCCAT value N)
00470 enum UCCATFLAGS {
00471    UCCATFLAG_Cn = (1 << UCCAT_Cn),        //!< Other , Not assigned (norm)
00472    UCCATFLAG_Lu = (1 << UCCAT_Lu),        //!< Letter, Uppercase (norm)
00473    UCCATFLAG_Ll = (1 << UCCAT_Ll),        //!< Letter, Lowercase (norm)
00474    UCCATFLAG_Lt = (1 << UCCAT_Lt),        //!< Letter, Titlecase (norm)
00475    UCCATFLAG_Lm = (1 << UCCAT_Lm),        //!< Letter, Modifier
00476    UCCATFLAG_Lo = (1 << UCCAT_Lo),        //!< Letter, Other
00477    UCCATFLAG_Mn = (1 << UCCAT_Mn),        //!< Mark, non-spacing (norm)
00478    UCCATFLAG_Mc = (1 << UCCAT_Mc),        //!< Mark, spacing combining (norm)
00479    UCCATFLAG_Me = (1 << UCCAT_Me),        //!< Mark, enclosing (norm)
00480    UCCATFLAG_Nd = (1 << UCCAT_Nd),        //!< Number Decimal digit (norm)
00481    UCCATFLAG_Nl = (1 << UCCAT_Nl),        //!< Number Letter (norm)
00482    UCCATFLAG_No = (1 << UCCAT_No),        //!< Number Other (norm)
00483    UCCATFLAG_Zs = (1 << UCCAT_Zs),        //!< Separator, space (norm)
00484    UCCATFLAG_Zl = (1 << UCCAT_Zl),        //!< Separator, line (norm)
00485    UCCATFLAG_Zp = (1 << UCCAT_Zp),        //!< Separator, paragraph (norm)
00486    UCCATFLAG_Cc = (1 << UCCAT_Cc),        //!< Other , control (norm)
00487    UCCATFLAG_Cf = (1 << UCCAT_Cf),        //!< Other , format (norm)
00488    UCCATFLAG_Cs = (1 << UCCAT_Cs),        //!< Other , surrogate (norm)
00489    UCCATFLAG_Co = (1 << UCCAT_Co),        //!< Other , Private use (norm)
00490    UCCATFLAG_Pc = (1 << UCCAT_Pc),        //!< Punctuation, connector
00491    UCCATFLAG_Pd = (1 << UCCAT_Pd),        //!< Punctuation, Dash
00492    UCCATFLAG_Ps = (1 << UCCAT_Ps),        //!< Punctuation, Open
00493    UCCATFLAG_Pe = (1 << UCCAT_Pe),        //!< Punctuation, Close
00494    UCCATFLAG_Pi = (1 << UCCAT_Pi),        //!< Punctuation, Initial quote
00495    UCCATFLAG_Pf = (1 << UCCAT_Pf),        //!< Punctuation, final quote
00496    UCCATFLAG_Po = (1 << UCCAT_Po),        //!< Punctuation, other
00497    UCCATFLAG_Sm = (1 << UCCAT_Sm),        //!< Symbol, math
00498    UCCATFLAG_Sc = (1 << UCCAT_Sc),        //!< Symbol, currency
00499    UCCATFLAG_Sk = (1 << UCCAT_Sk),        //!< Symbol, modifier
00500    UCCATFLAG_So = (1 << UCCAT_So),        //!< Symbol, other
00501    UCCATFLAG_Mi = (1 << 31)               //!< Mirroring.  NOTE(review): with a 32-bit int this shifts into the sign bit
00502    };
00503 #ifdef __cplusplus
00504 //! Following only defined for C++
00505 //! Don't add a semicolon to the end of this!
00506 DEFINE_ENUM_OPERATORS(UCCATFLAGS)
00507 #endif
00508       
00509 //! Unicode Bidirectional category from column 4 of UnicodeData.txt
00510 //! See details on the UCCAT enum above.  These are summarized in
00511 //! chapter 3 of the Unicode Standard
00512 enum BIDITAG {
00513    BIDITAG_L   = 0,        //!< Left-to-right
00514    BIDITAG_LRE = 1,        //!< Left-to-right Embedding
00515    BIDITAG_LRO = 2,        //!< Left-to-right Override
00516    BIDITAG_R   = 3,        //!< Right-to-left
00517    BIDITAG_AL  = 4,        //!< Right-to-left arabic
00518    BIDITAG_RLE = 5,        //!< Right-to-left embedding
00519    BIDITAG_RLO = 6,        //!< Right-to-left override
00520    BIDITAG_PDF = 7,        //!< Pop Directional Format
00521    BIDITAG_EN  = 8,        //!< European Number
00522    BIDITAG_ES  = 9,        //!< European Number separator
00523    BIDITAG_ET  = 10,       //!< European Number Terminator
00524    BIDITAG_AN  = 11,       //!< Arabic Number 
00525    BIDITAG_CS  = 12,       //!< Common Number Separator
00526    BIDITAG_NSM = 13,       //!< Non Spacing Mark
00527    BIDITAG_BN  = 14,       //!< Boundary Neutral
00528    BIDITAG_B   = 15,       //!< Paragraph Separator
00529    BIDITAG_WS  = 16,       //!< White Space
00530    BIDITAG_ON  = 17        //!< Other Neutrals
00531    };
00532 
00533 //! Flags for the above which can be ORd together (bit N corresponds to BIDITAG value N)
00534 enum BIDIFLAGS {
00535    BIDIFLAG_L     = (1 << BIDITAG_L),        //!< Left-to-right
00536    BIDIFLAG_LRE   = (1 << BIDITAG_LRE),      //!< Left-to-right Embedding
00537    BIDIFLAG_LRO   = (1 << BIDITAG_LRO),      //!< Left-to-right Override
00538    BIDIFLAG_R     = (1 << BIDITAG_R),        //!< Right-to-left
00539    BIDIFLAG_AL    = (1 << BIDITAG_AL),       //!< Right-to-left arabic
00540    BIDIFLAG_RLE   = (1 << BIDITAG_RLE),      //!< Right-to-left embedding
00541    BIDIFLAG_RLO   = (1 << BIDITAG_RLO),      //!< Right-to-left override
00542    BIDIFLAG_PDF   = (1 << BIDITAG_PDF),      //!< Pop Directional Format
00543    BIDIFLAG_EN    = (1 << BIDITAG_EN),       //!< European Number
00544    BIDIFLAG_ES    = (1 << BIDITAG_ES),       //!< European Number separator
00545    BIDIFLAG_ET    = (1 << BIDITAG_ET),       //!< European Number Terminator
00546    BIDIFLAG_AN    = (1 << BIDITAG_AN),       //!< Arabic Number 
00547    BIDIFLAG_CS    = (1 << BIDITAG_CS),       //!< Common Number Separator
00548    BIDIFLAG_NSM   = (1 << BIDITAG_NSM),      //!< Non Spacing Mark
00549    BIDIFLAG_BN    = (1 << BIDITAG_BN),       //!< Boundary Neutral
00550    BIDIFLAG_B     = (1 << BIDITAG_B),        //!< Paragraph Separator
00551    BIDIFLAG_WS    = (1 << BIDITAG_WS),       //!< White Space
00552    BIDIFLAG_ON    = (1 << BIDITAG_ON)        //!< Other Neutrals
00553    };
00554 
00555 #ifdef __cplusplus
00556 //! Following only defined for C++
00557 //! Don't add a semicolon to the end of this!
00558 DEFINE_ENUM_OPERATORS(BIDIFLAGS)
00559 #endif
00560 
00561 //! Decomposition tag.  Column 5 of UnicodeData.txt sometimes has a tag
00562 //! in angle brackets, (e.g. "<Isolated>")  This value represents that tag.
00563 //! For example, 0xFE90 "ARABIC LETTER BEH FINAL FORM" has, in the 5th
00564 //! column "<final> 0628", which tells us that this glyph is the 
00565 //! end-of-word form of the ARABIC LETTER BEH, which is at 0x0628
00566 //! See Chapter 4 of the Unicode Standard.
00567 enum UCDECOMP {
00568    UCDECOMP_NoTag       = 0,
00569    UCDECOMP_Initial     = 1,
00570    UCDECOMP_Medial      = 2,
00571    UCDECOMP_Final       = 3,
00572    UCDECOMP_Isolated    = 4,
00573    UCDECOMP_Compat      = 5,
00574    UCDECOMP_Wide        = 6,
00575    UCDECOMP_Narrow      = 7,
00576    UCDECOMP_Fraction    = 8,
00577    UCDECOMP_Subscript   = 9,
00578    UCDECOMP_Superscript = 10,
00579    UCDECOMP_Small       = 11, //!< A small variant form (CNS compatibility)
00580    UCDECOMP_Square      = 12, //!< A CJK squared font variant
00581    UCDECOMP_Circle      = 13, //!< An encircled form
00582    UCDECOMP_Font        = 14, //!< A font variant (e.g. a blackletter form)
00583    UCDECOMP_NoBreak     = 15, //!< A non-breaking form of a space or hyphen
00584    UCDECOMP_Vertical    = 16  //!< A vertical layout presentation form
00585    };
00586 
00587 #define UCDECOMPFLAG_Initial     (1 << UCDECOMP_Initial)
00588 #define UCDECOMPFLAG_Medial      (1 << UCDECOMP_Medial)
00589 #define UCDECOMPFLAG_Final       (1 << UCDECOMP_Final)
00590 #define UCDECOMPFLAG_Isolated    (1 << UCDECOMP_Isolated)
00591 #define UCDECOMPFLAG_PositionMask (UCDECOMPFLAG_Initial | UCDECOMPFLAG_Medial | UCDECOMPFLAG_Final | UCDECOMPFLAG_Isolated)
00592 
00593 #ifdef __cplusplus
00594 
00595 //! Structure representing "cooked" entry from UnicodeData.txt.
00596 //! (see www.unicode.org).  Note, 3 of the values in this structure
00597 //! could be enums, but I want them to take as little space as
00598 //! possible, and I can't guarantee that enum will be a byte on
00599 //! all platforms.  (have to be sure, because this struct is read
00600 //! out of a file.)
00601 struct UCDATA {
00602    //! The actual Unicode value
00603    UNICODE  ucval;
00604 
00605    //! If category == UCCAT_Lu, (Letter, Uppercase), this will
00606    //! be the Unicode value of the lowercase version of the same
00607    //! letter.  If category == UCCAT_Ll (Letter, Lowercase) this
00608    //! will be the uppercase version of the same letter
00609    UNICODE  altcase;
00610 
00611    //! Unicode value of the first composite character that starts
00612    //! with this Unicode value.  For example, Unicode value 0x0627
00613    //! (ARABIC LETTER ALEF) might have a firstcomp of 0x0623
00614    //! (ARABIC LETTER ALEF WITH HAMZA ABOVE).
00615    //! Due to the recursive nature of composition, this can't be
00616    //! done as a single linked list.  
00617    //! ARABIC LETTER ALEF WITH HAMZA ABOVE might have a nextcomp
00618    //! of ARABIC LETTER ALEF WITH HAMZA BELOW (next in the chain of
00619    //! things that started with ARABIC LETTER ALEF).  But it might
00620    //! have a firstcomp of ARABIC LETTER ALEF WITH HAMZA ABOVE FINAL 
00621    //! FORM (beginning a chain of things starting with ARABIC LETTER
00622    //! ALEF WITH HAMZA ABOVE)
00623    UNICODE firstcomp;
00624 
00625    //! Unicode value of the next composite character in the chain
00626    //! this character belongs to.  For example, Unicode value 0x00E0
00627    //! (LATIN SMALL LETTER A WITH GRAVE) might have a nextcomp of 
00628    //! 0x00E1 (LATIN SMALL LETTER A WITH ACUTE).  
00629    UNICODE nextcomp;
00630 
00631    //! General character category.  This is one of the UCCAT
00632    //! enums defined above.  The enum is based on the two letter
00633    //! code from UnicodeData.txt, so if UnicodeData.txt says a
00634    //! character is Lo (Letter, Other), the enum is UCCAT_Lo.
00635    UINT8    category;
00636 
00637    //! Bidirectional category tag.   BIDITAG enums defined above and
00638    //! explained in Chapter 3 of the Unicode Standard
00639    UINT8    biditag;
00640 
00641    //! Decomposition tag (see the UCDECOMP enum)
00642    UINT8    decomptag;
00643 
00644    //! Number of components if this is a composite glyph or a
00645    //! variant of the glyph
00646    UINT8    numcomp;
00647 
00648    //! Return the UCDATA for a given Unicode character.
00649    //! May return NULL if no data exists for a given Unicode char.
00650    static const UCDATA* GetCharData (
00651       UNICODE ucval
00652       );
00653 
00654    //! Get the list of components for this character.
00655    //! If numcomp > 0, the unicode character can be decomposed
00656    //! into that many components.  The list is read from the memory
00657    //! immediately following this structure, so this assumes you got
00658    //! this UCDATA by calling UCDATA::GetCharData()
00659    const UNICODE* GetComponants (
00660       ) const {
00661       return (reinterpret_cast<const UNICODE*>(this + 1));
00662       }
00663    };
00664 
00665 
00666 //! Determine if a Unicode character has a given property.
00667 //! The property flags (UCCATFLAG_*) are listed in ucstring.h.
00668 //! (the name is modelled on the is...() functions from ctype.h)
00669 //! There are two sets of flags because there are so many
00670 //! possibilities with Unicode.  You won't usually use this
00671 //! function directly; see the inline ucis...() wrappers.
00672 LIBEXPORT bool ucisprop (
00673    UNICODE c,
00674    UINT32 uccatflags
00675    );
00676 //! Test c against decompflags; presumably a mask of UCDECOMPFLAG_* values (TODO confirm)
00677 LIBEXPORT bool ucisdecompbidiprop(
00678    UNICODE c,
00679    UINT32 decompflags
00680    );
00681 //! Test c against bidiflags; presumably a mask of BIDIFLAG_* values (TODO confirm)
00682 LIBEXPORT bool ucisbidiprop (
00683    UNICODE c,
00684    UINT32 bidiflags
00685    );
00686 //! Presumably returns the lowercase form of c (compare UCDATA::altcase) -- TODO confirm
00687 LIBEXPORT UNICODE uctolower (
00688    UNICODE c
00689    );
00690 //! Presumably returns the uppercase form of c (compare UCDATA::altcase) -- TODO confirm
00691 LIBEXPORT UNICODE uctoupper (
00692    UNICODE c
00693    );
00694 
00695 
00696 
00697 //! Alpha: any letter category (Lu, Ll, Lm, Lo or Lt)
00698 inline bool ucisalpha (UNICODE c) {
00699    return ucisprop(c, UCCATFLAG_Lu | UCCATFLAG_Ll | UCCATFLAG_Lm | UCCATFLAG_Lo | UCCATFLAG_Lt);
00700    }
00701 
00702 //! Digit (any decimal digit, category Nd; not just 0-9)
00703 inline bool ucisdigit2 (UNICODE c) {
00704    return ucisprop(c, UCCATFLAG_Nd);
00705    }
00706 
00707 //! Alpha or Digit: any letter category (Lu, Ll, Lm, Lo, Lt) or decimal digit (Nd)
00708 inline bool ucisalnum (UNICODE c) {
00709    return ucisprop(c, UCCATFLAG_Lu | UCCATFLAG_Ll | UCCATFLAG_Lm | UCCATFLAG_Lo | UCCATFLAG_Lt | UCCATFLAG_Nd);
00710    }
00711 
00712 //! Control: either control (Cc) or format (Cf)
00713 inline bool uciscntrl (UNICODE c) {
00714    return ucisprop(c, UCCATFLAG_Cc | UCCATFLAG_Cf);
00715    }
00716 
00717 //! Space (space separators only; line and paragraph separators are not included)
00718 inline bool ucisspace (UNICODE c) {
00719    return ucisprop(c, UCCATFLAG_Zs);
00720    }
00721 
00722 //! Blank (same test as ucisspace(): space separators only)
00723 inline bool ucisblank (UNICODE c) {
00724    return ucisprop(c, UCCATFLAG_Zs);
00725    }
00726 //! Punctuation (any: connecting, dash, open, close, initial, final or other)
00727 inline bool ucispunct (UNICODE c) {
00728    return ucisprop(c, UCCATFLAG_Pc | UCCATFLAG_Pd | UCCATFLAG_Ps | UCCATFLAG_Pe | UCCATFLAG_Pi | UCCATFLAG_Pf | UCCATFLAG_Po);
00729    }
00730 //! Letter, uppercase
00731 inline bool ucisupper (UNICODE c) {
00732    return ucisprop(c, UCCATFLAG_Lu);
00733    }
00734 //! Letter, lowercase
00735 inline bool ucislower (UNICODE c) {
00736    return ucisprop(c, UCCATFLAG_Ll);
00737    }
00738 //! Letter, titlecase
00739 inline bool ucistitle (UNICODE c) {
00740    return ucisprop(c, UCCATFLAG_Lt);
00741    }
00742 //! Control, ISO (the Cc half of uciscntrl())
00743 inline bool ucisisocntrl (UNICODE c) {
00744    return ucisprop(c, UCCATFLAG_Cc);
00745    }
00746 //! Control, format (the Cf half of uciscntrl())
00747 inline bool ucisfmtcntrl (UNICODE c) {
00748    return ucisprop(c, UCCATFLAG_Cf);
00749    }
00750 //! Symbol (any: math, currency, other or modifier)
00751 inline bool ucissymbol (UNICODE c) {
00752    return ucisprop(c, UCCATFLAG_Sm | UCCATFLAG_Sc | UCCATFLAG_So | UCCATFLAG_Sk);
00753    }
00754 
00755 //! Number (digit, letter form, other)
00756 inline bool ucisnumber (UNICODE c) {
00757    return ucisprop(c, UCCATFLAG_Nd | UCCATFLAG_No | UCCATFLAG_Nl);
00758    }
00759 
00760 //! Mark, Non spacing (same test as ucisnsmark())
00761 inline bool ucisnonspacing (UNICODE c) {
00762    return ucisprop(c, UCCATFLAG_Mn);
00763    }
00764 
00765 //! Punctuation, Open
00766 inline bool ucisopenpunct (UNICODE c) {
00767    return ucisprop(c, UCCATFLAG_Ps);
00768    }
00769 
00770 //! Punctuation, Close
00771 inline bool ucisclosepunct (UNICODE c) {
00772    return ucisprop(c, UCCATFLAG_Pe);
00773    }
00774 
00775 //! Punctuation, Initial
00776 inline bool ucisinitialpunct (UNICODE c) {
00777    return ucisprop(c, UCCATFLAG_Pi);
00778    }
00779 
00780 //! Punctuation, Final
00781 inline bool ucisfinalpunct (UNICODE c) {
00782    return ucisprop(c, UCCATFLAG_Pf);
00783    }
00784 
00785 //! Strong Right-to-Left Directionality
00786 inline bool ucisrtl (UNICODE c) {
00787    return ucisbidiprop(c, BIDIFLAG_R | BIDIFLAG_AL);
00788    }
00789 
00790 //! Strong Left-to-Right Directionality
00791 inline bool ucisltr (UNICODE c) {
00792    return ucisbidiprop(c, BIDIFLAG_L);
00793    }
00794 
00795 //! Strong R-to-L or L-to-R Directionality
00796 inline bool ucisstrong (UNICODE c) {
00797    return ucisbidiprop(c, BIDIFLAG_L | BIDIFLAG_R | BIDIFLAG_AL);
00798    }
00799 
00800 //! Weak Directionality
00801 inline bool ucisweak (UNICODE c) {
00802    return ucisbidiprop(c, BIDIFLAG_EN | BIDIFLAG_ES | BIDIFLAG_ET | BIDIFLAG_AN | BIDIFLAG_CS);
00803    }
00804 
00805 //! Neutral Directionality (tests the WS and ON flags)
00806 inline bool ucisneutral (UNICODE c) {
00807    return ucisbidiprop(c, BIDIFLAG_WS | BIDIFLAG_ON);
00808    }
00809 
00810 
00811 //! Mark (any)
00812 inline bool ucismark (UNICODE c) {
00813    return ucisprop(c, UCCATFLAG_Mn | UCCATFLAG_Mc | UCCATFLAG_Me);
00814    }
00815 
00816 //! Letter, modifier
00817 inline bool ucismodif (UNICODE c) {
00818    return ucisprop(c, UCCATFLAG_Lm);
00819    }
00820 
00821 //! Punctuation, Connecting
00822 inline bool ucisconnect (UNICODE c) {
00823    return ucisprop(c, UCCATFLAG_Pc);
00824    }
00825 
00826 //! Punctuation, Dash
00827 inline bool ucisdash (UNICODE c) {
00828    return ucisprop(c, UCCATFLAG_Pd);
00829    }
00830 
00831 //! Symbol, math
00832 inline bool ucismath (UNICODE c) {
00833    return ucisprop(c, UCCATFLAG_Sm);
00834    }
00835 
00836 //! Symbol, Currency
00837 inline bool uciscurrency (UNICODE c) {
00838    return ucisprop(c, UCCATFLAG_Sc);
00839    }
00840 
00841 //! Symbol, modifier
00842 inline bool ucismodifsymbol (UNICODE c) {
00843    return ucisprop(c, UCCATFLAG_Sk);
00844    }
00845 
00846 //! Mark, non-spacing (same test as ucisnonspacing())
00847 inline bool ucisnsmark (UNICODE c) {
00848    return ucisprop(c, UCCATFLAG_Mn);
00849    }
00850 
00851 //! Mark, spacing
00852 inline bool ucisspmark (UNICODE c) {
00853    return ucisprop(c, UCCATFLAG_Mc);
00854    }
00855 
00856 //! Mark, enclosing
00857 inline bool ucisenclosing (UNICODE c) {
00858    return ucisprop(c, UCCATFLAG_Me);
00859    }
00860 
00861 //! Separator, Line
00862 inline bool ucislsep (UNICODE c) {
00863    return ucisprop(c, UCCATFLAG_Zl);
00864    }
00865 
00866 //! Separator, Paragraph
00867 inline bool ucispsep (UNICODE c) {
00868    return ucisprop(c, UCCATFLAG_Zp);
00869    }
00870 
00871 //! Han glyph (code points 0x4E00-0x9FFF or 0xF900-0xFAFF; a pure range test)
00872 inline bool ucishan (UNICODE c) {
00873    return ((c >= 0x4e00 && c <= 0x9fff) || (c >= 0xf900 && c <= 0xfaff));
00874    }
00875 
00876 //! Hangul glyph (code points 0xAC00-0xD7FF; a pure range test)
00877 inline bool ucishangul (UNICODE c) {
00878    return (c >= 0xac00 && c <= 0xd7ff);
00879    }
00880 
00881 #endif
00882 
00883 
00884 /*----------------------------------------------------------------------------*/
00885 /*    Flags for MtTextGetStringExtUC()                                        */
00886 /*----------------------------------------------------------------------------*/
00887 
00888 #define MTTEXT_NULLIfUnknown  0x00000001     //!<  Return NULL instead of the "unknown message" string
00889 
00890 /*----------------------------------------------------------------------------*/
00891 
00892 #if defined(__cplusplus)
00893 extern "C" {
00894 
00895 
00896 //!:Associate with "MtText functions"
00897 //!\addtogroup MtText MtText Functions
00898 //!@{
00899 //!\deprecated Use TEXTID instead
00900 
00901 #if !defined(DEPRECATE_GROUPKEY) || defined(MISYSTEMDLL)
00902 //! Read a string from messages.txt.
00903 //!
00904 //! \deprecated  Use MISTRING instead.
00905 //!
00906 //! Reads a string, which the caller should free when done using.
00907 //! If the requested message can't be found, it returns a string
00908 //! which says the message couldn't be found.  If you'd rather get
00909 //! a NULL pointer in that case, use MtTextGetStringExtUC() with MTTEXT_NULLIfUnknown.
00910 //!
00911 //! @see MtTextGetStringExtUC()
00912 LIBEXPORT UNICODE* MtTextGetStringUC (
00913    const char* group, 
00914    const char* key
00915    );
00916 #endif
00917 
00918 #if !defined(DEPRECATE_GROUPKEY) || defined(MISYSTEMDLL)
00919 //! Read a string from messages.txt, with option flags.
00920 //!
00921 //! \deprecated  Use MISTRING instead.
00922 //!
00923 //! Reads a string, which the caller should free when done using.
00924 //!
00925 //! @param group
00926 //!   Message group
00927 //! @param key
00928 //!   Message key
00929 //! @param flags
00930 //! \li <b>MTTEXT_NULLIfUnknown</b> Return NULL instead of the
00931 //!   "unknown message" string
00932 LIBEXPORT UNICODE* MtTextGetStringExtUC (
00933    const char* group, 
00934    const char* key, 
00935    UINT32 flags
00936    );
00937 #endif
00938 
00939 //!@} 
00940 
00941 }  //! Extern "C"
00942 #endif   //!< #if defined C++
00943 
00944 //!:Associate with "Unicode Functions"
00945 //!\addtogroup Unicode Unicode Functions
00946 //!@{
00947 
00948 #if defined(__cplusplus)
00949 extern "C" {
00950 #endif
00951 
00952 //! Convert an ASCII string to Unicode (with length limit).
00953 //!
00954 //! Note, this function doesn't do any character set mapping.  It
00955 //! simply casts all the source characters to (UNICODE)
00956 LIBEXPORT UNICODE* strntouc (
00957    UNICODE *dest, 
00958    const char *source, 
00959    int len
00960    );
00961 
00962 //! Convert an ASCII string to Unicode.
00963 //!
00964 //! Note, this function doesn't do any character set mapping.  It
00965 //! simply casts all the source characters to (UNICODE)
00966 LIBEXPORT UNICODE* strtouc (
00967    UNICODE *dest, 
00968    const char *source
00969    );
00970 
00971 //! Convert an ASCII string to Unicode and append to existing string.
00972 //!
00973 //! Note, this function doesn't do any character set mapping.  It
00974 //! simply casts all the source characters to (UNICODE)
00975 LIBEXPORT UNICODE* strtouccat (
00976    UNICODE *dest, 
00977    const char * source
00978    );
00979 
00980 //! Duplicate an ASCII string as a newly allocated UNICODE string.
00981 //!
00982 //! Note, this function doesn't do any character set mapping.  It
00983 //! simply casts all the source characters to (UNICODE).
00984 //! The resulting string should be MmFree()'d.
00985 LIBEXPORT UNICODE* strtoucdup (
00986    const char *source
00987    );
00988 
00989 //! UNICODE version of isdigit().
00990 //!
00991 //!   It only detects the ASCII numerals 0-9.  
00992 //!   Note that some implementations of isdigit() will crash if given
00993 //!   Unicode values > 255.
00994 #ifdef __cplusplus
00995 inline bool ucisdigit (UNICODE digit) {
00996    return (digit >= (UNICODE)'0' && digit <= (UNICODE)'9');
00997    }
00998 #else
00999 /* Inlines only work on C++ and some 3rd party libs don't
01000  * compile for C++ (the XmHTML lib is one such), so C code gets a
01001  * macro instead.  Beware: the macro evaluates its argument twice. */
01002 #define ucisdigit(x) ((x) >= (UNICODE)'0' && (x) <= (UNICODE)'9')
01003 #endif
01004 
01005 //! UNICODE version of strcat().
01006 LIBEXPORT UNICODE* ucstrcat (
01007    UNICODE *dest, 
01008    const UNICODE *source
01009    );
01010 
01011 //! UNICODE version of strchr().
01012 //!
01013 //! Find the first occurrence of a given character in a Unicode string.
01014 //! Returns a pointer to that character or NULL if not found.
01015 LIBEXPORT UNICODE* ucstrchr (
01016    const UNICODE *p, 
01017    UNICODE value
01018    );
01019 
01020 //! Compare two UNICODE strings.
01021 //!
01022 //! Returns -1 if (p1 < p2),
01023 //!         1 if (p1 > p2)
01024 //!         0 if (p1 == p2)
01025 LIBEXPORT int ucstrcmp (
01026    const UNICODE *p1, 
01027    const UNICODE *p2
01028    );
01029 
01030 //! Unicode version of strcpy().
01031 LIBEXPORT UNICODE* ucstrcpy (
01032    UNICODE *dest, 
01033    const UNICODE *source
01034    );
01035 
01036 //! Unicode version of strdup().
01037 LIBEXPORT UNICODE* ucstrdup (
01038    const UNICODE *s
01039    );
01040 
01041 //! Unicode version of strlen().
01042 LIBEXPORT size_t ucstrlen (
01043    const UNICODE *p
01044    );
01045 
01046 //! Compare two UNICODE strings (case insensitive).
01047 //!
01048 //! Returns -1 if (p1 < p2),
01049 //!         1 if (p1 > p2)
01050 //!         0 if (p1 == p2)
01051 LIBEXPORT int ucstricmp (
01052    const UNICODE *p1, 
01053    const UNICODE *p2
01054    );
01055 
01056 //! Convert UNICODE string to lowercase (in-place).
01057 //!
01058 //! Modifies the input string
01059 LIBEXPORT UNICODE* ucstrlwr (UNICODE *p);
01060 
01061 //! UNICODE version of strncat().
01062 LIBEXPORT UNICODE* ucstrncat (
01063    UNICODE *p1, 
01064    const UNICODE *p2, 
01065    int len
01066    );
01067 
01068 //! Compare two UNICODE strings (length limited).
01069 //!
01070 //! Returns -1 if (p1 < p2),
01071 //!         1 if (p1 > p2)
01072 //!         0 if (p1 == p2)
01073 LIBEXPORT int ucstrncmp (
01074    const UNICODE *p1, 
01075    const UNICODE *p2, 
01076    int len
01077    );
01078 
01079 //! UNICODE version of strncpy().
01080 LIBEXPORT UNICODE* ucstrncpy (
01081    UNICODE *dest, 
01082    const UNICODE *source, 
01083    int len
01084    );
01085 
01086 //! Compare two UNICODE strings (length limited and case insensitive).
01087 //!
01088 //! Returns -1 if (p1 < p2),
01089 //!         1 if (p1 > p2)
01090 //!         0 if (p1 == p2)
01091 LIBEXPORT int ucstrnicmp (
01092    const UNICODE *p1, 
01093    const UNICODE *p2, 
01094    int len
01095    );
01096 
01097 //! UNICODE version of strpbrk().
01098 //!
01099 //! Locate the first occurrence in the string s of any of the characters in the 
01100 //! string accept.  Returns a pointer to it or NULL if not found.
01101 LIBEXPORT UNICODE* ucstrpbrk (
01102    UNICODE *s, 
01103    UNICODE *accept
01104    );
01105 
01106 //! UNICODE version of strrchr().
01107 //!
01108 //! Locates the last occurrence of the character value in the string s.
01109 //! Returns a pointer to it or NULL if not found.
01110 LIBEXPORT UNICODE* ucstrrchr (
01111    UNICODE *s, 
01112    UNICODE value
01113    );
01114 
01115 //! UNICODE version of strspn().
01116 //!
01117 //! Returns the length of the initial segment of s which consists entirely of 
01118 //! characters in accept.
01119 LIBEXPORT size_t ucstrspn (
01120    const UNICODE *s, 
01121    const UNICODE *accept
01122    );
01123 
01124 //! UNICODE version of strstr().
01125 //!
01126 //! Finds the first occurrence of the substring p in the string s.
01127 //! Returns a pointer to it or NULL if not found.
01128 LIBEXPORT UNICODE* ucstrstr (
01129    UNICODE *s, 
01130    const UNICODE *p
01131    );
01132 
01133 //! Extract token from string
01134 //!
01135 //! A "token" is a nonempty string of characters not occurring in the string
01136 //! delim, followed by \\0 or by a character occurring in delim.
01137 //!
01138 //! The ucstrtok() function can be used to parse the string s
01139 //! into tokens.  The first call the ucstrtok() should have s
01140 //! as its first argument.  Subsequent calls should have the
01141 //! first argument set to NULL.  Each call returns a pointer to
01142 //! next token, or