// utf.h //----------------------------------------------------------------- // Microsoft Confidential // Copyright 1998 Microsoft Corporation. All Rights Reserved. // // June 1, 1998 [paulde] Revised for UniLib, surrogates // // Routines are documented in more detail below the declarations in // the "-- DOCUMENTATION --" section. // //----------------------------------------------------------------- #ifdef _MSC_VER #pragma once #endif #ifndef __UTF_H__ #define __UTF_H__ // See "About the UTF-8 file signature" below for information and usage. // EF BB BF #define UTF8SIG "\xEF\xBB\xBF" #define UTF8SIGLEN (3) // U8TU_* flags returned by UTF8ToUnicode[Info] #define U8TU_NONASCII 0x00000001 // Info : found non-ASCII chars #define U8TU_UCS4 0x00000002 // Info : found UCS-4 (4-byte) chars. The Unicode data contains surrogates. #define U8TU_OVERLONG 0x00000004 // Info: found UTF-8 sequence longer than required: char converted to UCH_REPLACE #define U8TU_TRAIL_NO_COUNT 0x80000100 // Error: trail byte with 0 trail count #define U8TU_COUNT_NO_TRAIL 0x80000200 // Error: nonzero trail count but no trail byte #define U8TU_UCS4OUTOFRANGE 0x80000400 // Error: UCS4 char is out of range to represent in Unicode // Test a U8TU_... value for errors #define U8TU_IsError(dw) (0 != ((dw) & 0x80000000)) // NOTE: UTF-8 encoders are supposed to use the shortest possible sequence to represent a character. // Overlong sequences are either a bug in the UTF-8 encoder, or an attempt to use overlong data // as a covert channel or to circumvent security. Overlong sequences generate UCH_REPLACE instead of // the character, and we flag U8TU_OVERLONG, but this is not an error. // // VU16_* flags returned by ValidateUTF16 #define VU16_NONASCII 0x00000001 // Info : found non-ASCII chars #define VU16_UCS4 0x00000002 // Info : found non-BMP chars. The Unicode data contains surrogates. #define VU16_NONCHAR 0x00000004 // Info : found noncharacters (U+xFFFE and U+xFFFF, where x is from 0 to 0x10, // as well as the values U+FDD0..U+FDEF) #define VU16_UNPAIRSURROGATE 0x80000100 // Error: high surrogate without a low surrogate, or vice versa // Test a VU16_... value for errors #define VU16_IsError(dw) (0 != ((dw) & 0x80000000)) #define NULL_TERMINATED_MODE (-1L) // !!!!!!!!!!!!!!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!!!!!!!!!!!! // // DO NOT pass NULL output buffers to get the required buffer size. // // To get the length of output buffers, use the explicit length functions: // UnicodeLengthOfUTF8, UTF8LengthOfUnicode. // // "just convert it as fast as possible" int WINAPI UTF8ToUTF16 ( /* __in_xcount(cbUTF) */ PCSTR pUTF8, int cbUTF, __out_ecount(cchUTF16) PWSTR pUTF16, int cchUTF16); // "convert it and tell me all about it" int WINAPI UTF8ToUTF16Info ( /* __in_xcount(*pcbUTF8) */ PCSTR pUTF8, int * pcbUTF8, __out_ecount(cchUTF16) PWSTR pUTF16, int cchUTF16, DWORD * pdwInfo); // "tell me ALL about it" // when fScanAll is FALSE, errors immediately terminate scanning the data int WINAPI GetUTF8Info ( /* __in_xcount(*pcbUTF8) */ PCSTR pUTF8, int * pcbUTF8, DWORD * pdwInfo, BOOL fScanAll = FALSE); // "just tell me how many wchars I'll need, as fast as possible" int WINAPI UTF16LengthOfUTF8 ( /* __in_xcount(cbUTF8) */ PCSTR pUTF8, int cbUTF8); int WINAPI UTF16ToUTF8 ( /* __in_xcount(*pcchUTF16) */ PCWSTR pUTF16, int * pcchUTF16, __out_ecount(cbUTF8) PSTR pUTF8, int cbUTF8); // "tell me exactly how many chars I need to convert" int WINAPI UTF8LengthOfUTF16 ( /* __in_xcount(cchUTF16) */ PCWSTR pUTF16, int cchUTF16); // "tell me about any errors in this UTF-16 text" DWORD WINAPI ValidateUTF16 (PCWSTR pUTF16, int cchUTF16, BOOL fScanAll = FALSE); //================= COMPATIBILITY WRAPPERS ======================== // // UTF-16 is the more precise term, but there is existing code that // uses "Unicode" to refer to UTF-16. inline int WINAPI UTF8ToUnicode( /* __in_xcount(cbUTF) */ PCSTR pUTF8, int cbUTF, __out_ecount(cchUni) PWSTR pUni, int cchUni) { return UTF8ToUTF16(pUTF8, cbUTF, pUni, cchUni); } inline int WINAPI UTF8ToUnicodeInfo( /* __in_xcount(cbUTF) */PCSTR pUTF8, int * pcbUTF8, __out_ecount(cchUni) PWSTR pUni, int cchUni, DWORD * pdwInfo) { return UTF8ToUTF16Info(pUTF8, pcbUTF8, pUni, cchUni, pdwInfo); } inline int WINAPI UnicodeLengthOfUTF8( /* __in_xcount(cbUTF8) */ PCSTR pUTF8, int cbUTF) { return UTF16LengthOfUTF8(pUTF8, cbUTF); } inline int WINAPI UnicodeToUTF8( /* __in_ecount(*pcchUni) */ PCWSTR pUni, int * pcchUni, __out_ecount(cbUTF) PSTR pUTF8, int cbUTF) { return UTF16ToUTF8(pUni, pcchUni, pUTF8, cbUTF); } inline int WINAPI UTF8LengthOfUnicode(PCWSTR pUni, int cchUni) { return UTF8LengthOfUTF16( /* __in_xcount(cchUni) */ pUni, cchUni); } //===================== DOCUMENTATION ============================= //----------------------------------------------------------------- // // int UTF8ToUTF16Info (PCSTR pUTF8, int * pcbUTF8, PWSTR pUTF16, int cchUTF16, DWORD * pdwInfo); // // Convert UTF8 to UTF-16 // // pUTF8 UTF-8 data // pcbUTF8 IN : Count of UTF-8 bytes to convert, or NULL_TERMINATED_MODE. // OUT: Count of UTF-8 bytes converted. // pUTF16 Buffer for converted UTF-16 text // cchUTF16 Size of UTF-16 buffer in WCHARs. // pdwInfo NULL or address of flags for errors/information. // See U8TU_* flags above for more info. // // Return: // Count of 16-bit code units written, including 0 terminator // if NULL_TERMINATED_MODE. // // The conversion always completes, even in the presence of errors. *pdwInfo contains // status and error information. When there are conversion errors, the Unicode buffer // may contain one or more of character UCH_REPLACE (0xFFFD) "REPLACEMENT CHARACTER" // for un-convertible data. // //----------------------------------------------------------------- // int GetUTF8Info (PCSTR pUTF8, int * pcbUTF8, DWORD * pdwInfo, BOOL fScanAll = FALSE); // // Get size and optional information/errors for conversion of UTF-8 to UTF-16. // // pUTF8 UTF-8 data // pcbUTF8 IN : Count of UTF-8 bytes to scan, or NULL_TERMINATED_MODE. // OUT: Count of valid UTF-8 scanned. // pdwInfo Information and errors in the conversion. // See U8TU_* flags above for more info. // fScanAll TRUE : scan entire UTF-8 data // FALSE: stop scanning at the first error and return // Return: // Count of 16-bit code units required to represent the characters scanned, // including 0 terminator if NULL_TERMINATED_MODE // //----------------------------------------------------------------- // DWORD ValidateUTF16 (PCWSTR pUTF16, int cchUTF16, BOOL fScanAll = FALSE) // // Get size and optional information/errors for conversion of UTF-8 to UTF-16. // // pUTF16 UTF-16 data // cchUTF16 Count of 16-bit code units to scan, or NULL_TERMINATED_MODE. // fScanAll TRUE : scan entire text // FALSE: stop scanning at the first error and return // Return: // Information and errors in the text -- See VU16_* flags above for more info. // //----------------------------------------------------------------- // // int UTF16ToUTF8 (PCWSTR pUTF16, int * pcchUTF16, PSTR pUTF8, int cbUTF8) // // Convert UTF-16 to UTF-8 // // pUTF16 Source UTF-16 data // pcchUTF16 in: Count of 16-bit code units to convert, or NULL_TERMINATED_MODE. // out: count of 16-bit code units converted. // pUTF8 Destination buffer. // cbUTF8 Count of bytes in pUTF8. // // Returns: // Number of bytes written, including 0 terminator if NULL_TERMINATED_MODE. // // //----------------------------------------------------------------- // int UTF8LengthOfUTF16 (PCWSTR pUTF16, int cchUTF16) // // Get exact number of bytes required to convert // //----------------------------------------------------------------- // About the UTF-8 file signature // ------------------------------- // The UTF-8 file signature is the UTF-8 encoding of the Unicode byte order mark. // the byte order mark is generally used as a signature for Unicode files. // // When writing plain-text UTF-8 files, begin the file with this signature. // // When reading plain-text files: // // If you see the UTF-8 signature, you can assume that the file is UTF-8. // // If there is no UTF-8 signature, you can try converting. If the conversion // has no errors, you can be fairly sure that the file is UTF-8 and not // an MBCS plain-text file. This cannot be 100% positive, however. It is // possible to construct MBCS files in various codepages that also happen to // be valid UTF-8. // //----------------------------------------------------------------- #endif // __UTF_H__