/*
 * nputf.c - Routines for utf text processing for notepad
 *
 *   Copyright (C) 1998-2001 Microsoft Inc.
 */
|
|
|
|
#include "precomp.h"
|
|
|
|
|
|
/* IsTextUTF8
 *
 * Heuristic check of whether a byte buffer holds UTF-8 encoded text.
 *
 * UTF-8 was originally specified by RFC 2279 and later restricted by
 * RFC 3629 to the code points U+0000..U+10FFFF.  Basically:
 *
 *   0000 0000-0000 007F - 0xxxxxxx                             (1 octet, ascii)
 *   0000 0080-0000 07FF - 110xxxxx 10xxxxxx                    (2 octet format)
 *   0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx           (3 octet format)
 *   0001 0000-0010 FFFF - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  (4 octet format)
 *
 * Lead bytes that can never occur in well-formed UTF-8 are rejected:
 *   0xC0, 0xC1   - would be an overlong encoding of a 7-bit character
 *   0xF5..0xFF   - beyond U+10FFFF (this includes the obsolete 5 and 6
 *                  octet forms and the bytes 0xFE/0xFF)
 *
 * Continuation octets are only checked for the 10xxxxxx bit pattern, so
 * overlong 3/4 octet forms and encoded surrogates are not detected here.
 *
 * Parameters:
 *   lpstrInputStream - bytes to examine (need not be NUL terminated)
 *   iLen             - number of bytes to examine
 *
 * Return value: TRUE,  if the text is in UTF-8 format.
 *               FALSE, if the text is not in UTF-8 format, if the buffer
 *                      pointer is NULL, or if iLen <= 0.
 *               We will also return FALSE if it is only 7-bit ascii, so the
 *               right code page will be used.
 *
 * Actually for 7 bit ascii, it doesn't matter which code page we use, but
 * notepad will remember that it is utf-8 and "save" or "save as" will store
 * the file with a UTF-8 BOM.  Not cool.
 */

INT IsTextUTF8( LPSTR lpstrInputStream, INT iLen )
{
    INT   i;
    INT   cTrail    = 0;      // continuation octets still owed by the current character
    BOOL  bAllAscii = TRUE;   // stays TRUE until a byte with the high bit set is seen
    UCHAR chr;

    if( lpstrInputStream == NULL ) {
        return FALSE;
    }

    for( i = 0; i < iLen; i++ ) {
        // LPSTR is a plain char pointer; go through UCHAR so bytes >= 0x80
        // compare as 128..255 whatever the signedness of char.
        chr = (UCHAR) lpstrInputStream[i];

        if( cTrail > 0 ) {
            // inside a multi-octet character: non-leading bytes must be 10xxxxxx
            if( (chr & 0xC0) != 0x80 ) {
                return FALSE;
            }
            cTrail--;
            continue;
        }

        if( chr < 0x80 ) {
            continue;           // 7 bit ascii after a complete character is just fine
        }

        bAllAscii = FALSE;

        //
        // Start of an encoding.  The lead byte fixes how many continuation
        // octets must follow; anything outside these ranges (a stray
        // continuation byte, 0xC0/0xC1, 0xF5..0xFF) is malformed.
        //
        if( chr >= 0xC2 && chr <= 0xDF ) {
            cTrail = 1;
        }
        else if( (chr & 0xF0) == 0xE0 ) {
            cTrail = 2;
        }
        else if( chr >= 0xF0 && chr <= 0xF4 ) {
            cTrail = 3;
        }
        else {
            return FALSE;
        }
    }

    //
    // End of text.  Check for consistency.
    //

    if( cTrail > 0 ) {          // a character cut short at the end is an error
        return FALSE;
    }

    if( bAllAscii ) {           // Not utf-8 if all ascii.  Forces caller to use code pages for conversion
        return FALSE;
    }

    return TRUE;
}
|
|
|
|
|
|
/* IsInputTextUnicode
|
|
* Verify if the input stream is in Unicode format.
|
|
*
|
|
* Return value: TRUE, if the text is in Unicode format.
|
|
*
|
|
* 29 June 1998
|
|
*/
|
|
|
|
|
|
INT IsInputTextUnicode (LPSTR lpstrInputStream, INT iLen)
|
|
{
|
|
INT iResult= ~0; // turn on IS_TEXT_UNICODE_DBCS_LEADBYTE
|
|
BOOL bUnicode;
|
|
|
|
bUnicode= IsTextUnicode( lpstrInputStream, iLen, &iResult);
|
|
|
|
// this code is not required as IsTextUnicode does the required checks
|
|
// and it's legal to have a unicode char with a DBCS leading byte!
|
|
|
|
#ifdef UNUSEDCODE
|
|
{
|
|
|
|
if (bUnicode &&
|
|
((iResult & IS_TEXT_UNICODE_STATISTICS) != 0 ) &&
|
|
((iResult & (~IS_TEXT_UNICODE_STATISTICS)) == 0 ) )
|
|
{
|
|
CPINFO cpiInfo;
|
|
CHAR* pch= (CHAR*)lpstrInputStream;
|
|
INT cb;
|
|
|
|
//
|
|
// If the result depends only upon statistics, check
|
|
// to see if there is a possibility of DBCS.
|
|
// Only do this check if the ansi code page is DBCS
|
|
//
|
|
|
|
GetCPInfo( CP_ACP, &cpiInfo);
|
|
|
|
if( cpiInfo.MaxCharSize > 1 )
|
|
{
|
|
for( cb=0; cb<iLen; cb++ )
|
|
{
|
|
if( IsDBCSLeadByte(*pch++) )
|
|
{
|
|
return FALSE;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
return bUnicode;
|
|
}
|