194 lines
6.9 KiB
C++
194 lines
6.9 KiB
C++
// CaseMap.cpp -- Unicode case mapping routines for locale 0x0409
|
|
#include "stdafx.h"
|
|
|
|
/*
|
|
|
|
The data constants and code below implement case mapping for the US English
|
|
locale. The primary goal here was to make the size of code plus data as small
|
|
as possible while keeping reasonable run-time speed.
|
|
|
|
The easiest implementation would have been to simply define a 65,536 entry
|
|
table of USHORT values. Then the case mapping would be just an array indexing
|
|
operation. However, the data size for upper and lower case mapping would be
|
|
256 K bytes. That is clearly unacceptable -- especially since there are only 736
|
|
letter characters in the Unicode set.
|
|
|
|
The next approach is to make a short list of the letter positions together
|
|
with the corresponding case-mapped positions. That gives us a data size of
|
|
5,888 bytes (8 x 736). Then we also need code to binary-search the list of
|
|
letter positions to see if a particular code point has a case mapping and,
|
|
if so, to locate the corresponding mapped value.
|
|
|
|
This is good, but we can do better by noticing that the case mapping is not
|
|
random. Quite often a contiguous block of Unicode code points maps to positions
|
|
with the same relative offset. In the ASCII section, for example, the 26 lower
|
|
case letters all map down by 0x20 positions, while the 26 upper case letters
|
|
map up by 0x20 positions. In other areas of Unicode we find that quite often
|
|
every other position uses the same relative offset.
|
|
|
|
That observation together with some simple methods to pack information
|
|
efficiently leads to the current implementation which uses 840 bytes of data
|
|
and a look-up algorithm which is just a little bit more complicated than a
|
|
binary search.
|
|
|
|
We could probably make the data smaller still by using a more complicated
|
|
data structure and a more complicated algorithm, but it isn't clear that the
|
|
additional effort would be worthwhile. That is, the additional code space may
|
|
well be larger than the data-space savings.
|
|
|
|
*/
|
|
|
|
#include "CaseTab.h"
|
|
|
|
// The code below does case mapping using a binary search to find the appropriate
|
|
// code map block and then applying that block. For the case insensitive string
|
|
// comparison, we keep the most recently used block around so that we can avoid
|
|
// the binary search in many cases.
|
|
|
|
// Locates the code map block that governs the code point wc.
//
//   wc      - code point to look up.
//   pCMB    - table of map blocks. The binary search below assumes the
//             entries are ordered by ascending iwcFirst.
//   cBlocks - number of entries in pCMB.
//
// Returns (by value) the last block whose iwcFirst is <= wc. The result only
// says that wc is at or beyond the block's first position; this routine does
// not check wc against the block's span. When wc lies below the first block,
// or when the table is empty, UCMB_NULL.cmb is returned instead.

static CodeMapBlock Find_0x0409_Map_Block(WCHAR wc,
                                          const CodeMapBlock *pCMB,
                                          UINT cBlocks
                                         )
{
    // An empty table has no pCMB[0] to inspect, so it must be rejected before
    // the first-entry comparison is made.

    if (cBlocks == 0 || wc < pCMB[0].iwcFirst)
        return UCMB_NULL.cmb; // Map block with zero entries based at zero offset.

    // Invariant: pCMB[iLow].iwcFirst <= wc, and every block at index iHigh or
    // beyond (if any) starts above wc. iHigh is an exclusive bound.

    UINT iLow  = 0;
    UINT iHigh = cBlocks;

    while (iHigh - iLow > 1)
    {
        UINT iMid = iLow + ((iHigh - iLow) >> 1);

        if (wc >= pCMB[iMid].iwcFirst)
            iLow = iMid;
        else
            iHigh = iMid;
    }

    return pCMB[iLow]; // Only one candidate is left.
}
|
|
|
|
// Applies a single code map block to the code point wc.
//
//   wc             - code point to map.
//   mblk           - block to apply. Only the upper end of its span is checked
//                    here; the callers in this file make sure that wc is not
//                    below mblk.iwcFirst before relying on the result.
//   paiDeltaValues - table of signed offsets, indexed by mblk.iDelta.
//
// Returns wc plus the block's offset when wc is covered by the block, and wc
// unchanged otherwise. When mblk.fGapIs2 is set, only positions with the same
// odd/even parity as the block's first position are covered.

static WCHAR Map_from_0x0409_Block(WCHAR wc,
                                   CodeMapBlock mblk,
                                   const short *paiDeltaValues
                                  )
{
    const UINT iwcBase = mblk.iwcFirst;

    const bool fBelowEnd   = wc < iwcBase + mblk.cwcSpan;
    const bool fParityFits = !mblk.fGapIs2 || UINT(wc & 1) == (iwcBase & 1);

    if (fBelowEnd && fParityFits)
        return wc + paiDeltaValues[mblk.iDelta];

    return wc; // Not covered by this block.
}
|
|
|
|
// Maps one code point through a complete case table: finds the block that
// governs wc and then applies it.
//
//   wc             - code point to map.
//   paiDeltaValues - offset table that belongs to the block table.
//   pCMB, cBlocks  - block table to search and its entry count.
//
// Returns the mapped code point, or wc itself when no block covers it.

static WCHAR Map_to_0x0409_Case(WCHAR wc,
                                const short *paiDeltaValues,
                                const CodeMapBlock *pCMB,
                                UINT cBlocks
                               )
{
    const CodeMapBlock mblkFound = Find_0x0409_Map_Block(wc, pCMB, cBlocks);

    return Map_from_0x0409_Block(wc, mblkFound, paiDeltaValues);
}
|
|
|
|
static WCHAR Map_to_0x0409_Lower_with_History(WCHAR wc,
|
|
CodeMapBlock &mblkLower,
|
|
CodeMapBlock &mblkUpper
|
|
)
|
|
{
|
|
// This routine does a lower case mapping optimized for text which is mostly
|
|
// letters. It also looks for characters which commonly occur in file and
|
|
// stream paths.
|
|
//
|
|
// The main trick here is to keep track of the last letter mapping we used
|
|
// because it is probably still valid. If it isn't we adjust the mappings
|
|
// to match the kind of letter character we're processing.
|
|
|
|
if ( wc < L'A' // Below the first letter?
|
|
|| wc == L'\\' // Path separator?
|
|
)
|
|
return wc;
|
|
|
|
if (wc >= mblkLower.iwcFirst && wc < mblkLower.iwcFirst + mblkLower.cwcSpan)
|
|
return Map_from_0x0409_Block(wc, mblkLower, aiDeltaValues_Lower);
|
|
|
|
if (wc >= mblkUpper.iwcFirst && wc < mblkUpper.iwcFirst + mblkUpper.cwcSpan)
|
|
{
|
|
if (wc != Map_from_0x0409_Block(wc, mblkUpper, aiDeltaValues_Upper))
|
|
return wc; // WC was a lower case letter already!
|
|
}
|
|
|
|
CodeMapBlock mblkLC = Find_0x0409_Map_Block
|
|
(wc,
|
|
&(UCMB_Lower->cmb),
|
|
sizeof(UCMB_Lower) / sizeof(UCodeMapBlock)
|
|
);
|
|
|
|
CodeMapBlock mblkUC = Find_0x0409_Map_Block
|
|
(wc,
|
|
&(UCMB_Upper->cmb),
|
|
sizeof(UCMB_Upper) / sizeof(UCodeMapBlock)
|
|
);
|
|
|
|
WCHAR wcLC = Map_from_0x0409_Block(wc, mblkLC, aiDeltaValues_Lower);
|
|
WCHAR wcUC = Map_from_0x0409_Block(wc, mblkUC, aiDeltaValues_Upper);
|
|
|
|
if (wcLC != wc || wcUC != wc) // Was wc a letter?
|
|
{
|
|
mblkLower = mblkLC;
|
|
mblkUpper = mblkUC;
|
|
}
|
|
|
|
return wcLC;
|
|
}
|
|
|
|
// Case-insensitive comparison of two strings using the 0x0409 lower-case
// mapping.
//
//   pwcLeft, pwcRight - the strings to compare. Both are read until a
//                       difference or a zero code point is found, so both
//                       must be zero terminated.
//
// Returns zero when the two strings differ only by case. Otherwise the result
// is the difference between the first pair of lower-cased code points that
// do not match (left minus right), which orders strings by their Unicode
// character values. This is important for later path manager implementations
// which use Trie structures.

INT wcsicmp_0x0409(const WCHAR * pwcLeft, const WCHAR *pwcRight)
{
    // Most recently used map blocks. They are shared by both strings so that
    // the mapping routine can usually skip its table search.

    CodeMapBlock mblkUC = UCMB_NULL.cmb;
    CodeMapBlock mblkLC = UCMB_NULL.cmb;

    for (;;)
    {
        WCHAR wcLeft  = Map_to_0x0409_Lower_with_History(*pwcLeft ++, mblkLC, mblkUC);
        WCHAR wcRight = Map_to_0x0409_Lower_with_History(*pwcRight++, mblkLC, mblkUC);

        INT diff = wcLeft - wcRight;

        // Stop at the first difference, or when both strings end together
        // (diff is zero and wcLeft is the terminator).

        if (diff || !wcLeft)
            return diff;
    }
}
|
|
|
|
// Returns the upper-case form of wc under the 0x0409 mapping tables, or wc
// itself when the upper-case table has no mapping for it.

WCHAR WC_To_0x0409_Upper(WCHAR wc)
{
    const UINT cBlocks = sizeof(UCMB_Upper) / sizeof(UCodeMapBlock);

    // The offset table is passed directly, as the history-based lower-case
    // routine in this file does. No cast is used, so the compiler still
    // checks the table's type.

    return Map_to_0x0409_Case(wc, aiDeltaValues_Upper, &(UCMB_Upper->cmb), cBlocks);
}
|
|
|
|
// Returns the lower-case form of wc under the 0x0409 mapping tables, or wc
// itself when the lower-case table has no mapping for it.

WCHAR WC_To_0x0409_Lower(WCHAR wc)
{
    const UINT cBlocks = sizeof(UCMB_Lower) / sizeof(UCodeMapBlock);

    // The offset table is passed directly, as the history-based lower-case
    // routine in this file does. No cast is used, so the compiler still
    // checks the table's type.

    return Map_to_0x0409_Case(wc, aiDeltaValues_Lower, &(UCMB_Lower->cmb), cBlocks);
}
|
|
|