262 lines
12 KiB
C
262 lines
12 KiB
C
|
#ifndef TRIE_H
|
||
|
#define TRIE_H
|
||
|
|
||
|
#include <windows.h>
|
||
|
|
||
|
#ifdef __cplusplus
|
||
|
extern "C" {
|
||
|
#endif
|
||
|
|
||
|
/* Abstract trie node structure (TRIESCAN, declared below). wch is a character to transition on;
   wFlags describe various things about the compressed trie; lpbNode points to the first byte of
   the next node in this state, and lpbDown points to the first byte referenced by the down
   pointer, if any */
|
||
|
|
||
|
typedef struct tagTAGDATA
|
||
|
{
|
||
|
DWORD cTag; // Count of tagged nodes below this node in the subtree
|
||
|
DWORD dwData; // Stored tagged data for this node
|
||
|
} TAGDATA;
|
||
|
|
||
|
/* Number of tag slots per node; sizes the TRIESCAN.aTags array. */
#define MAXTAGS     8

/* Refuse to build if someone raises MAXTAGS past 8.
   NOTE(review): presumably because the per-tag masks (e.g. TRIESTATS.wTagsMask) are
   single BYTEs, i.e. one bit per tag -- confirm before changing the limit. */
#if MAXTAGS > 8
#error No more than 8 tags are allowed
#endif
|
||
|
|
||
|
typedef struct tagTRIESCAN
|
||
|
{
|
||
|
WCHAR wch; // Unicode character
|
||
|
WORD wFlags; // see below
|
||
|
WORD wMask; // which tags are valid
|
||
|
WORD __pad0; //
|
||
|
DWORD cWords; // Words in subtree (only valid if TRIE_NODE_COUNT is set)
|
||
|
DWORD cSkipWords; // Words in subtrees ignored when following a skip pointer
|
||
|
LPBYTE lpbNode; // Address of next byte within the compressed trie
|
||
|
LPBYTE lpbDown; // Address referenced by down pointer, if any
|
||
|
LPBYTE lpbRight; // Address referenced by right pointer, if any
|
||
|
LPBYTE lpbSRDown; // Last single-ref address referenced
|
||
|
TAGDATA aTags[MAXTAGS]; // The list of tag counts/data
|
||
|
} TRIESCAN, *PTRIESCAN, *LPTRIESCAN;
|
||
|
|
||
|
// Trie node flags, only the lower 16 bits of the flags are saved in the trie.
// Each flag is a distinct single bit so they can be OR-ed together in wFlags.

#define TRIE_NODE_VALID         0x00000001  // wch is the last letter of a valid word
#define TRIE_NODE_END           0x00000002  // Last node in the state (no more alternatives to wch)
#define TRIE_NODE_COUNT         0x00000004  // The count of words in the subtree is stored in the node
#define TRIE_NODE_TAGGED        0x00000008  // The node has tagged data
#define TRIE_NODE_DOWN          0x00000010  // iDown is valid (word so far is a valid prefix)
#define TRIE_NODE_RIGHT         0x00000020  // iRight is valid (word connects to a substate)
#define TRIE_DOWN_INLINE        0x00000040  // iDown omitted, since it points to next node in memory
#define TRIE_DOWN_MULTI         0x00000080  // iDown is a second reference or worse
#define TRIE_DOWN_ABS           0x00000100  // iDown is an absolute immediate offset into the trie
#define TRIE_NODE_SKIP          0x00000200  // Either iRight is a skip pointer or EOS is a 'soft' EOS
#define TRIE_NODE_SKIP_COUNT    0x00000400  // cSkipWords is valid
|
||
|
|
||
|
/* Macros to access the data in the node, works for dawgs and tries.
   Each takes a pointer to a node structure that has wch and wFlags members
   (both TRIESCAN and DAWGNODE do). The *FLAG macros yield the masked bit,
   i.e. non-zero when the flag is set, not necessarily 1. */

#define DAWGDATA(pdawg)       ((pdawg)->wch)
#define DAWGDOWNFLAG(pdawg)   ((pdawg)->wFlags & TRIE_NODE_DOWN)
#define DAWGENDFLAG(pdawg)    ((pdawg)->wFlags & TRIE_NODE_END)
#define DAWGWORDFLAG(pdawg)   ((pdawg)->wFlags & TRIE_NODE_VALID)
|
||
|
|
||
|
/* Fixed-length part of the compressed trie header */
|
||
|
|
||
|
typedef struct tagTRIESTATS
|
||
|
{
|
||
|
WORD version; // Version of this particular compressed trie
|
||
|
WORD __pad0; //
|
||
|
BYTE wTagsMask; // Which tags are in use
|
||
|
BYTE wEnumMask; // Which tags have enumeration
|
||
|
BYTE wDataMask; // Which tags have stored data
|
||
|
BYTE cTagFields; // Total tags in use
|
||
|
WORD cMaxWord; // Number of characters in longest word
|
||
|
WORD cMaxState; // Number of nodes in longest state (max alternatives)
|
||
|
WORD cCharFlagsCodesMax; // Bytes in longest char/flags code
|
||
|
WORD cTagsCodesMax; // Bytes in longest tagged data code
|
||
|
WORD cMRPointersCodesMax; // Bytes in longest MR pointer code
|
||
|
WORD cSROffsetsCodesMax; // Bytes in longest Single-ref code
|
||
|
DWORD cWords; // Number of words in dictionary
|
||
|
DWORD cUniqueSROffsets; // Unique offsets in Single-ref segment
|
||
|
DWORD cUniqueCharFlags; // Unique char/flags pairs
|
||
|
DWORD cUniqueTags; // Unique tagged data values
|
||
|
DWORD cUniqueMRPointers; // Unique multi-ref pointers
|
||
|
DWORD cbHeader; // Bytes in header & tables
|
||
|
DWORD cbTrie; // Bytes in trie
|
||
|
} TRIESTATS, *PTRIESTATS, *LPTRIESTATS;
|
||
|
|
||
|
/* Primary unit of a node. Nodes usually contain a pointer too */

typedef struct tagCHARFLAGS {
    wchar_t wch;        // character for this node
    short   wFlags;     // flag bits for this node
                        // NOTE(review): signed short here, whereas TRIESCAN.wFlags is a WORD;
                        // left unchanged to preserve the existing layout and type.
} CHARFLAGS, *PCHARFLAGS, *LPCHARFLAGS;
|
||
|
|
||
|
/* Control structure used to decompress the trie */
|
||
|
|
||
|
typedef struct tagTRIECTRL
|
||
|
{
|
||
|
TRIESTATS *lpTrieStats; // Pointer to base of header segment
|
||
|
WORD *lpwCharFlagsCodes; // decoding table for Char/flags
|
||
|
WORD *lpwTagsCodes; // decoding table for tagged data
|
||
|
WORD *lpwMRPointersCodes; // decoding table for multiref pointers
|
||
|
WORD *lpwSROffsetsCodes; // decoding table for singleref offsets
|
||
|
CHARFLAGS *lpCharFlags; // table to convert codes to char/flags
|
||
|
DWORD *lpwTags; // table to convert codes to tagged data
|
||
|
DWORD *lpwMRPointers; // table to convert codes to multiref pointers
|
||
|
DWORD *lpwSROffsets; // table to convert codes to Singleref offsets
|
||
|
BYTE *lpbTrie; // Pointer to the trie.
|
||
|
} TRIECTRL, *PTRIECTRL, *LPTRIECTRL;
|
||
|
|
||
|
/* Useful Constants */

#define TRIE_MAX_DEPTH  128     // We'll fail on any words longer than this
|
||
|
|
||
|
// The prototypes below are plain C (this is required for use with C++)
|
||
|
|
||
|
/* Given a pointer to a mapped file or resource containing a compressed trie,
|
||
|
read the trie into memory, making all the allocations required */
|
||
|
|
||
|
TRIECTRL * WINAPI TrieInit(LPBYTE lpByte);
|
||
|
|
||
|
/* Free all the allocations associated with a trie */
|
||
|
|
||
|
void WINAPI TrieFree(LPTRIECTRL lpTrieCtrl);
|
||
|
|
||
|
/* NOTE(review): undocumented in this header. Presumably decodes the compressed node that
   lpTrieScan currently addresses into *lpTrieScan, using the tables in lpTrieCtrl --
   confirm against the implementation. */
void WINAPI
TrieDecompressNode(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan);
|
||
|
|
||
|
/* Given a compressed trie and a pointer to a decompresed node from it, find and decompress
|
||
|
the next node in the same state. lpTrieScan is a user-allocated structure that holds the
|
||
|
decompressed node and into which the new node is copied.
|
||
|
This is equivalent to traversing a right pointer or finding the next alternative
|
||
|
letter at the same position. If there is no next node (i.e.this is the end of the state)
|
||
|
then TrieGetNextNode returns FALSE. To scan from the beginning of the trie, set the lpTrieScan
|
||
|
structure to zero */
|
||
|
|
||
|
BOOL WINAPI
|
||
|
TrieGetNextNode(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan);
|
||
|
|
||
|
BOOL WINAPI
|
||
|
TrieSkipNextNode(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan, WCHAR wch);
|
||
|
|
||
|
/* Follow the down pointer to the next state. This is equivalent to accepting the character
|
||
|
in this node and advancing to the next character position. Returns FALSE if there is no
|
||
|
down pointer. This also decompresses the first node in the state, so all the values in
|
||
|
lpTrieScan will be good. */
|
||
|
|
||
|
BOOL WINAPI
|
||
|
TrieGetNextState(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan);
|
||
|
|
||
|
/* Check the validity of a word or prefix. Starts from the root of pTrie looking for
|
||
|
pwszWord. If it finds it, it returns TRUE and the user-provided lpTrieScan structure
|
||
|
contains the final node in the word. If there is no path, TrieCheckWord returns FALSE
|
||
|
To distinguisha valid word from a valid prefix, caller must test
|
||
|
wFlags for fTrieNodeValid. */
|
||
|
|
||
|
BOOL WINAPI
|
||
|
TrieCheckWord(LPTRIECTRL lpTrieCtrl, LPTRIESCAN lpTrieScan, wchar_t * lpwszWord);
|
||
|
|
||
|
int WINAPI
|
||
|
TrieWordToIndex(
|
||
|
TRIECTRL *ptc, // Trie in which to find word index
|
||
|
wchar_t *pwszWord // Word for which we're looking
|
||
|
);
|
||
|
|
||
|
BOOL WINAPI
|
||
|
TrieIndexToWord(
|
||
|
TRIECTRL *ptc, // Trie in which to find indexed word
|
||
|
DWORD nIndex, // Index for which we're looking
|
||
|
wchar_t *pwszWord, // Returned word
|
||
|
int cwc // Max characters in buffer (including NULL)
|
||
|
);
|
||
|
|
||
|
int WINAPI
|
||
|
TrieWordToTagIndex(
|
||
|
TRIECTRL *ptc, // Trie in which to find word index
|
||
|
wchar_t *pwszWord, // Word for which we're looking
|
||
|
int tag // Which tag to enumerate
|
||
|
);
|
||
|
|
||
|
BOOL WINAPI
|
||
|
TrieTagIndexToWord(
|
||
|
TRIECTRL *ptc, // Trie in which to find indexed word
|
||
|
DWORD nIndex, // Index for which we're looking
|
||
|
wchar_t *pwszWord, // Returned word
|
||
|
int cwc, // Max characters in buffer (including NULL)
|
||
|
int tag // Which tag to enumerate
|
||
|
);
|
||
|
|
||
|
BOOL WINAPI
|
||
|
TrieGetTagsFromWord(
|
||
|
TRIECTRL *ptc, // Trie in which to find word
|
||
|
wchar_t *pwszWord, // Word for which we're looking
|
||
|
DWORD *pdw, // Returned values
|
||
|
BYTE *pbValid // Mask for valid return values
|
||
|
);
|
||
|
|
||
|
int WINAPI
|
||
|
TriePrefixToRange(
|
||
|
TRIECTRL *ptc, // Trie in which to find prefix range
|
||
|
wchar_t *pwszWord, // Prefix for which we're looking
|
||
|
int *piStart // Start of range with this prefix
|
||
|
);
|
||
|
|
||
|
/**** Subroutines for traversing Directed Acyclic Word Graphs ****/
|
||
|
|
||
|
/* Abstract trie node structure. wch is a character to transition on; flags describe various things
|
||
|
about the compressed trie; iDown indexes the first node in the state wch transitions to. DAWG is a special
|
||
|
kind of trie: a "Directed Acyclic Word Graph," essentially an ending-compressed trie. */
|
||
|
|
||
|
typedef struct tagDAWGNODE
|
||
|
{
|
||
|
DWORD wch; // Unicode character
|
||
|
DWORD wFlags; // see below
|
||
|
DWORD cWords; // Words below this node in the subtree
|
||
|
DWORD cSkipWords; // Words below skipped nodes
|
||
|
DWORD iDown; // Offset of first node in next state
|
||
|
DWORD iRight; // Offset to first node in next substate
|
||
|
DWORD cTags[8]; // Count of tagged nodes below this node in the subtree
|
||
|
DWORD dwData[8]; // Stored tagged data for this node
|
||
|
} DAWGNODE, *PDAWGNODE, *LPDAWGNODE;
|
||
|
|
||
|
/* Given a trie and a pointer to a node in it, find the next node in that state.
|
||
|
This is equivalent to traversing a right pointer or finding the next alternative
|
||
|
letter at the same position. Returns a pointer to the new node, NULL if there is
|
||
|
no next node (i.e. if this is the end of a state).*/
|
||
|
|
||
|
DAWGNODE * WINAPI DawgGetNextNode(void *pTrie, DAWGNODE *pTrieNode);
|
||
|
|
||
|
/* From this node, find the first node in the state it points to. This is equivalent
|
||
|
to traversing a down pointer or extending the word one letter and finding the first
|
||
|
alternative. Returns a pointer to the first node in the new state, NULL if there is
|
||
|
no down pointer. To find the first state in the trie, use pTrieNode == NULL */
|
||
|
|
||
|
DAWGNODE * WINAPI DawgGetNextState(void *pTrie, DAWGNODE *pTrieNode);
|
||
|
|
||
|
/* Check the validity of a word or prefix. Starts from the root of pTrie looking for
|
||
|
pwszWord. If it finds it, it returns a pointer to the terminal node in pTrie Returns
|
||
|
NULL if there is no path through the trie that corresponds to pwszWord. To distinguish
|
||
|
a valid word from a valid prefix, caller must test wFlags for fTrieNodeValid. */
|
||
|
|
||
|
DAWGNODE * WINAPI DawgCheckWord(void *pTrie, wchar_t *pwszWord);
|
||
|
|
||
|
/* Walk the trie from pTrieNode, calling pfnTrieWord on every valid word. pvParam is passed through
|
||
|
to pfnTrieWord. If pfnTrieWord returns non-zero, the enumeration stops. pwszWord must point to a
|
||
|
space of cwchTrieWordMax+1 wchar_t's. To walk the entire trie, pass NULL for pTrieNode. Returns
|
||
|
the number of words traversed. pfnTrieWord may be null if all you want is the count of words. */
|
||
|
|
||
|
int WINAPI
|
||
|
DawgEnumerate(
|
||
|
void *pTrie, // Trie to enumerate
|
||
|
DAWGNODE *pTrieNodeStart, // point to enumerate from, NULL if all
|
||
|
wchar_t *pwszWord, // buffer to hold words being enumerated
|
||
|
void *pvParam, // parameter to pass to pfnTrieWord
|
||
|
int (*pfnTrieWord)(wchar_t *pwszWord, void *pvParam)
|
||
|
);
|
||
|
|
||
|
// end plain C Prototypes
|
||
|
|
||
|
#ifdef __cplusplus
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#endif // TRIE_H
|