;
; search.asm
;
; 08/16/96  jforbes  ASM implementation of binary_search_findmatch()
;
; There is a fair amount of optimisation towards instruction-scheduling.
;
; About 58% of the time is spent in the binary_search_findmatch()
; routine.  Around 31% is spent in the optimal parser.
;
; Syntax/target: MASM, 32-bit flat model (.386P / .model FLAT).
;
        TITLE   SEARCH.ASM

        .386P
.model FLAT

PUBLIC  _binary_search_findmatch

_TEXT   SEGMENT

INCLUDE offsets.i

;
; Local stack frame (offsets from esp after "sub esp, LOCAL_STACK")
;
$match_length   EQU 0           ; longest match found so far
$small_len      EQU 4           ; small_len
$small_ptr      EQU 8           ; small_ptr (address of a Left[]/Right[] slot)
$big_ptr        EQU 12          ; big_ptr   (address of a Left[]/Right[] slot)
$end_pos        EQU 16          ; endpos = BufPos - (window_size - 4)
$clen           EQU 20          ; clen = min(small_len, big_len)
$left           EQU 24          ; cached [context + OFF_LEFT]
$right          EQU 28          ; cached [context + OFF_RIGHT]
$mem_window     EQU 32          ; cached [context + OFF_MEM_WINDOW]
$matchpos_table EQU 36          ; context + OFF_MATCHPOS_TABLE
$context        EQU 40          ; saved context pointer
$best_repeat    EQU 44          ; best_repeated_offset

LOCAL_STACK     EQU 48          ; total size of the frame above

MIN_MATCH       EQU 2
MAX_MATCH       EQU 257
BREAK_LENGTH    EQU 50

;-----------------------------------------------------------------------
; binary_search_findmatch(t_encoder_context *context, long BufPos)
;
; In:    [esp + 4] = context, [esp + 8] = BufPos  (at entry, before
;        the six register pushes below)
; Out:   eax = match length; zero when the tree root entry is
;        <= endpos, or when truncating at the 32K boundary leaves
;        fewer than MIN_MATCH bytes
; Saves: ebx, ecx, edx, esi, edi, ebp are pushed on entry and popped
;        on exit; the arguments are left on the stack (ret 0)
; Stack: 24 bytes of saved registers + LOCAL_STACK bytes of locals
;
; Side effects visible in this routine: updates tree_root[tree_to_use],
; the Left[]/Right[] slots reached through small_ptr/big_ptr, and
; entries of the matchpos table.
;-----------------------------------------------------------------------
_binary_search_findmatch PROC NEAR

        push    ebx
        push    ecx
        push    edx
        push    esi
        push    edi
        push    ebp

        ; 6 pushes (24 bytes) + return address (4) => first arg at +28
        mov     ebp, [esp + 28]                 ; context
        mov     esi, [esp + 32]                 ; bufpos

; tree_to_use = *((ushort *) &enc_MemWindow[BufPos])
        mov     edi, [ebp + OFF_MEM_WINDOW]     ; edi = _enc_MemWindow
        xor     eax, eax
        mov     ax, WORD PTR [edi + esi]        ; eax = tree_to_use

        sub     esp, LOCAL_STACK                ; allocate space for stack vars

        mov     [esp + $mem_window], edi
        mov     [esp + $context], ebp

        lea     ecx, [ebp + OFF_MATCHPOS_TABLE]
        mov     [esp + $matchpos_table], ecx

        mov     ecx, [ebp + OFF_TREE_ROOT]
        mov     ebx, [ecx + eax*4]              ; ebx = tree_root[tree_to_use]
        mov     [ecx + eax*4], esi              ; tree_root[tree_to_use] = bufpos

        lea     edx, [esi + 4]                  ; edx = BufPos+4
        sub     edx, [ebp + OFF_WINDOW_SIZE]    ; endpos = BufPos-(ws-4)
        mov     [esp + $end_pos], edx

; if (ptr <= endpos)
; have a short "stub" jump so that the jump is paired
;
; NOTE(review): this first test is signed (jle) while the loop-bottom
; tests against end_pos below are unsigned (ja / jna).  They agree as
; long as both values are below 2^31 - confirm against the C source.
        cmp     ebx, edx
        jle     SHORT close_ptr_le_endpos

;
; for main loop:
;
; eax = scratch
; ebx = ptr
; ecx = same
; edx = scratch
; esi = BufPos
; edi = scratch
; ebp = big_len
;

;
; The following instructions have been carefully
; interleaved for simultaneous execution on a Pentium's
; U and V pipelines.
;
        mov     edi, 2                          ; commonly used constant here
        mov     edx, [ebp + OFF_LEFT]

        mov     [esp + $left], edx
        mov     [esp + $clen], edi              ; clen = 2

        lea     edx, [edx + esi*4]              ; edx = &Left[BufPos]
        lea     eax, [esi + edi]                ; eax = BufPos+2

        mov     [esp + $small_ptr], edx         ; smallptr=&Left[BufPos]
        mov     [esp + $match_length], edi      ; match_length = 2

        mov     edx, [ebp + OFF_RIGHT]
        mov     [esp + $right], edx

        sub     eax, ebx                        ; eax = BufPos-ptr+2
        lea     edx, [edx + esi*4]              ; edx = &Right[BufPos]

        mov     [esp + $small_len], edi         ; small_len = 2
        mov     [esp + $big_ptr], edx           ; bigptr=&Right[BufPos]

        mov     ecx, edi                        ; same = 2 (first iter)

; enc_matchpos_table[2] = BufPos - ptr + 2
        mov     edi, [esp + $mem_window]
        mov     [ebp + OFF_MATCHPOS_TABLE + 8], eax

        add     edi, ecx                        ; u edi = &enc_MemWindow[clen]
        mov     ebp, 2                          ; v big_len = 2 (context pointer no longer needed in ebp)

        mov     eax, [edi + esi]                ; u *(DWORD*) enc_MemWindow[b] (bufpos+clen)
        jmp     SHORT main_loop                 ; v

; near-jump trampoline so the conditional jump above can be SHORT
close_ptr_le_endpos:
        jmp     ptr_le_endpos

;
; same <= big_len
;
; this code is actually replicated much later in this file,
; but it's too far away for a SHORT jump, which will cause
; pipeline stalls.
;
close_same_le_biglen:
        mov     edx, [esp + $left]              ; u
        mov     eax, [esp + $big_ptr]           ; v

        lea     edi, [edx + ebx*4]              ; u edi=&Left[ptr]
        mov     [eax], ebx                      ; v *big_ptr=ptr

        mov     [esp + $big_ptr], edi           ; u big_ptr=&left[ptr]
        mov     ecx, DWORD PTR [esp + $clen]    ; v clen (next iter.)

        mov     ebx, [edi]                      ; u ptr = *big_ptr
        mov     edi, [esp + $mem_window]        ; v (next iter.)

; bottom of main loop
        add     edi, ecx                        ; u edi = &enc_MemWindow[clen]
        cmp     ebx, [esp + $end_pos]           ; v

; for next iteration
        mov     eax, [edi + esi]                ; u *(DWORD*) enc_MemWindow[b] (bufpos+clen)
        ja      SHORT main_loop                 ; v  while (ptr > endpos), unsigned

; fall through

; near-jump trampoline for SHORT conditional exits
close_exit_main_loop:
        jmp     exit_main_loop

;
; same <= small_len
;
; ditto - see above
;
close_same_le_smalllen:
        mov     edx, [esp + $right]
        mov     eax, [esp + $small_ptr]

        lea     edi, [edx + ebx*4]              ; u edi = &Right[ptr]
        mov     [eax], ebx                      ; v *small_ptr = ptr

        mov     [esp + $small_ptr], edi         ; u small_ptr = &right[ptr]
        mov     ecx, [esp + $clen]              ; v for next iteration

        mov     ebx, [edi]                      ; u ptr = *small_ptr
        mov     edi, [esp + $mem_window]        ; v (next iter.)

; bottom of main loop
        add     edi, ecx                        ; u (next iter.)
        cmp     ebx, [esp + $end_pos]           ; v

        mov     eax, [edi + esi]                ; u (next iter.)
        jna     SHORT close_exit_main_loop      ; v  exit when ptr <= endpos, unsigned

; fall through to main loop

;
; at the bottom of the main loop, we goto here
;
; On entry: ebx = ptr, ecx = clen, edi = &enc_MemWindow[clen],
;           eax = DWORD at enc_MemWindow[BufPos+clen]
;
main_loop:

;
; If the first characters don't match, then we know for
; certain that we have not exceeded small_len or big_len,
; and therefore clen won't change either.  We can therefore
; skip some of the checks.
;
; This is the most common case.
;
; These jumps must be SHORT to be paired.
;
        cmp     [edi + ebx], al                 ; u  window[ptr+clen] vs window[BufPos+clen]
        ja      SHORT close_same_le_smalllen    ; v

        jb      SHORT close_same_le_biglen      ; u

        shr     eax, 8                          ; u  next BufPos byte into al
        inc     ecx                             ; v  same++

;
; second and further iterations
;
; we only check same (ecx) against MAX_MATCH
; every 4 characters
;
; operations paired for U and V pipeline
; simultaneous execution
;
; notes:
;    SHR must be on the U pipeline
;
unrolled_loop:

; 1
        cmp     [edi + ebx + 1], al             ; u
        jne     SHORT not_eq                    ; v

        shr     eax, 8                          ; u
        inc     ecx                             ; v

; 2
        cmp     [edi + ebx + 2], al
        jne     SHORT not_eq

        shr     eax, 8
        inc     ecx

; 3
        cmp     [edi + ebx + 3], al
        jne     SHORT not_eq

        mov     eax, [edi + esi + 4]            ; u  refill eax with the next 4 BufPos bytes
        inc     ecx                             ; v

        mov     dl, [edi + ebx + 4]             ; u
        add     edi, 4                          ; v

; 4
        cmp     dl, al
        jne     SHORT not_eq

        shr     eax, 8
        inc     ecx

        cmp     ecx, MAX_MATCH
        jl      SHORT unrolled_loop

;
; same >= MAX_MATCH
;
; ecx could be larger than MAX_MATCH right now,
; so correct it
;
; NOTE(review): the load of edx here looks redundant - long_match
; reloads edx from the same slot before using it.
        mov     edx, [esp + $match_length]
        mov     ecx, MAX_MATCH
        jmp     SHORT long_match

;
; same >= BREAK_LENGTH: stop searching.
;   *small_ptr = Left[ptr];  *big_ptr = Right[ptr];  goto end_bsearch
;
same1_ge_break_length:
same2_ge_break_length:

; can trash clen (ecx)

        mov     ecx, [esp + $left]              ; ecx = Left
        mov     eax, [esp + $small_ptr]         ; eax = small_ptr
        mov     ecx, [ecx + ebx*4]              ; ecx = Left[ptr]
        mov     edx, [esp + $right]             ; edx = Right
        mov     [eax], ecx                      ; *small_ptr = left[ptr]
        mov     edx, [edx + ebx*4]              ; edx = Right[ptr]
        mov     eax, [esp + $big_ptr]           ; eax = big_ptr
        mov     [eax], edx                      ; *big_ptr = right[ptr]

; goto end_bsearch
        jmp     end_bsearch

;
; warning, "same" (ecx) could be larger than
; MAX_MATCH, so we will have to correct it
;
; Flags here still come from the failing cmp in the unrolled loop
; (window byte at ptr vs. window byte at BufPos); jne does not alter them.
;
not_eq:
        ja      val_greater_than_0

;
; -----------------------------------------
; VAL < 0
; -----------------------------------------
;
val_less_than_0:

; if (same > big_len)
        cmp     ecx, ebp
        jle     SHORT same_le_biglen

; if (same > match_length)
        cmp     ecx, [esp + $match_length]
        jle     SHORT same1_le_ml

; here's where we truncate ecx to MAX_MATCH if it
; was too large
        cmp     ecx, MAX_MATCH
        jg      SHORT trunc_same1

back_from_trunc1:
long_match:
        mov     edi, [esp + $matchpos_table]
        lea     eax, [esi + 2]                  ; eax = BufPos+2
        mov     edx, [esp + $match_length]      ; edx = old match_length
        sub     eax, ebx                        ; eax = BufPos-ptr+2

; do
; {
;    enc_matchpos_table[++match_length] = BufPos-ptr+2
; } while (match_length < same);

; store match_length
        mov     [esp + $match_length], ecx      ; match_length = same

loop1:
; match_length++
        inc     edx

; enc_matchpos_table[match_length] = BufPos-ptr+2
        mov     [edi + edx*4], eax

; while (match_length < same)
        cmp     edx, ecx
        jl      SHORT loop1

; if (same >= BREAK_LENGTH)
        cmp     ecx, BREAK_LENGTH
        jge     SHORT same1_ge_break_length

; same <= match_length
same1_le_ml:

; clen = min(small_len, big_len=same)
        cmp     [esp + $small_len], ecx

; big_len = same  (mov does not disturb the flags from the cmp above)
        mov     ebp, ecx

; small_len >= same?
        jge     SHORT over1

; no, small_len < same
; therefore clen := small_len
; (otherwise clen stays at big_len which ==same)
        mov     ecx, [esp + $small_len]

over1:
        mov     [esp + $clen], ecx

;
; same <= big_len
;
same_le_biglen:
        mov     edx, [esp + $left]              ; u
        mov     eax, [esp + $big_ptr]           ; v

        lea     edi, [edx + ebx*4]              ; u edi=&Left[ptr]
        mov     [eax], ebx                      ; v *big_ptr=ptr

        mov     [esp + $big_ptr], edi           ; u big_ptr=&left[ptr]
        mov     ecx, DWORD PTR [esp + $clen]    ; v clen (next iter.)

        mov     ebx, [edi]                      ; u ptr = *big_ptr
        mov     edi, [esp + $mem_window]        ; v (next iter.)

; bottom of main loop
        add     edi, ecx                        ; u edi = &enc_MemWindow[clen]
        cmp     ebx, [esp + $end_pos]           ; v

; for next iteration
        mov     eax, [edi + esi]                ; u *(DWORD*) enc_MemWindow[b] (bufpos+clen)
        ja      main_loop                       ; v

        jmp     exit_main_loop

; clamp same to MAX_MATCH (VAL < 0 path)
trunc_same1:
        mov     ecx, MAX_MATCH
        jmp     SHORT back_from_trunc1

; clamp same to MAX_MATCH (VAL > 0 path)
trunc_same2:
        mov     ecx, MAX_MATCH
        jmp     SHORT back_from_trunc2

;
; -----------------------------------------
; VAL > 0
; -----------------------------------------
;
val_greater_than_0:

; if (same > small_len)
        cmp     ecx, [esp + $small_len]
        jle     SHORT same_le_smalllen

; if (same > match_length)
        cmp     ecx, [esp + $match_length]
        jle     SHORT same2_le_ml

; here's where we truncate ecx to MAX_MATCH if it
; was too large
        cmp     ecx, MAX_MATCH
        jg      SHORT trunc_same2

; can trash clen
; eax = BufPos-ptr+2
back_from_trunc2:
        mov     edi, [esp + $matchpos_table]
        lea     eax, [esi + 2]                  ; eax = BufPos+2
        mov     edx, [esp + $match_length]      ; edx = old match_length
        sub     eax, ebx                        ; eax = BufPos-ptr+2

        mov     [esp + $match_length], ecx      ; match_length = same

; do
; {
;    enc_matchpos_table[++match_length] = BufPos-ptr+2
; } while (match_length < same);

loop2:
        inc     edx                             ; match_length++

; enc_matchpos_table[match_length] = BufPos-ptr+2
        mov     [edi + edx*4], eax

        cmp     edx, ecx
        jl      SHORT loop2

; if (same >= BREAK_LENGTH)
        cmp     ecx, BREAK_LENGTH
        jge     same2_ge_break_length

same2_le_ml:
; NOTE(review): edx is overwritten at same_le_smalllen before it is
; read, so this load appears to be dead.  Left in place to keep the
; original instruction stream unchanged.
        mov     edx, [esp + $small_len]

; clen = min(small_len=same, big_len)
        cmp     ebp, ecx

; small_len = same  (mov does not disturb the flags from the cmp above)
        mov     [esp + $small_len], ecx
        jge     SHORT over2

; big_len < same, so clen := big_len
        mov     ecx, ebp

over2:
        mov     [esp + $clen], ecx

;
; same <= small_len
;
same_le_smalllen:
        mov     edx, [esp + $right]
        mov     eax, [esp + $small_ptr]

        lea     edi, [edx + ebx*4]              ; u edi = &Right[ptr]
        mov     [eax], ebx                      ; v *small_ptr = ptr

        mov     [esp + $small_ptr], edi         ; u small_ptr = &right[ptr]
        mov     ecx, [esp + $clen]              ; v for next iteration

        mov     ebx, [edi]                      ; u ptr = *small_ptr
        mov     edi, [esp + $mem_window]        ; v (next iter.)

; bottom of main loop
        add     edi, ecx                        ; u (next iter.)
        cmp     ebx, [esp + $end_pos]           ; v

        mov     eax, [edi + esi]                ; u (next iter.)
        ja      main_loop

; ptr <= endpos: terminate both open branches
exit_main_loop:
        mov     eax, [esp + $small_ptr]
        mov     edx, [esp + $big_ptr]

; *small_ptr = 0
        mov     DWORD PTR [eax], 0

; *big_ptr = 0
        mov     DWORD PTR [edx], 0

end_bsearch:

;
; now check for repeated offsets
;
; eax holds match_length for all three repeated-offset scans below.
;

;
; FIRST REPEATED OFFSET
;
        mov     eax, [esp + $match_length]

; for (i = 0; i < match_length; i++)
;   compare bufpos+i vs. bufpos+i-enc_last_matchpos_offset[0]
        mov     edi, [esp + $mem_window]

; ebx = bufpos
        mov     ebx, esi

; repeated offset zero
; ebx = bufpos - repeated_offset[0]
        mov     ecx, [esp + $context]
        sub     ebx, [ecx + OFF_LAST_MATCHPOS_OFFSET]

; i = 0
        xor     ecx, ecx

rp1_loop:
        mov     dl, [edi + esi]
        cmp     dl, [edi + ebx]
        jne     SHORT rp1_mismatch

; i++
        inc     ecx

; inc window pointer
        inc     edi

; i < match_length?
        cmp     ecx, eax
        jl      SHORT rp1_loop

;
; i == match_length
;
; therefore force ourselves to take rp1
;
; (this code is not in the C source, since it is
; messy to do)
;
        mov     ebx, [esp + $matchpos_table]

; enc_matchpos_table[match_length .. MIN_MATCH] = 0
force_rp1_copy:
        mov     DWORD PTR [ebx + ecx*4], 0
        dec     ecx
        cmp     ecx, MIN_MATCH
        jge     SHORT force_rp1_copy
        jmp     boundary_check

;
; i < match_length
;
rp1_mismatch:

; best_repeated_offset = i
        mov     [esp + $best_repeat], ecx

; if (i >= MIN_MATCH)
        cmp     ecx, MIN_MATCH
        jl      SHORT try_rp2

; for (; i >= MIN_MATCH; i--)
;    enc_matchpos_table[i] = 0
        mov     ebx, [esp + $matchpos_table]

rp1_copy:
        mov     DWORD PTR [ebx + ecx*4], 0
        dec     ecx
        cmp     ecx, MIN_MATCH
        jge     SHORT rp1_copy

; quick check: skip the other repeated offsets when
; best_repeated_offset > BREAK_LENGTH
        cmp     DWORD PTR [esp + $best_repeat], BREAK_LENGTH
        jg      boundary_check

;
; SECOND REPEATED OFFSET
;
try_rp2:

; for (i = 0; i < match_length; i++)
;   compare bufpos+i vs. bufpos+i-enc_last_matchpos_offset[1]
        mov     edi, [esp + $mem_window]

; ebx = bufpos
        mov     ebx, esi

; repeated offset one
; ebx = bufpos - repeated_offset[1]
        mov     ecx, [esp + $context]
        sub     ebx, [ecx + OFF_LAST_MATCHPOS_OFFSET + 4]

; i = 0
        xor     ecx, ecx

rp2_loop:
        mov     dl, [edi + esi]
        cmp     dl, [edi + ebx]
        jne     SHORT rp2_mismatch

; i++
        inc     ecx

; inc window pointer
        inc     edi

; i < match_length?
        cmp     ecx, eax
        jl      SHORT rp2_loop

;
; i == match_length
;
; therefore force ourselves to take rp2
;
; (this code is not in the C source, since it is
; messy to do)
;
        mov     ebx, [esp + $matchpos_table]

; enc_matchpos_table[match_length .. MIN_MATCH] = 1
force_rp2_copy:
        mov     DWORD PTR [ebx + ecx*4], 1
        dec     ecx
        cmp     ecx, MIN_MATCH
        jge     SHORT force_rp2_copy
        jmp     SHORT boundary_check

rp2_mismatch:

; if (i > best_repeated_offset)
        cmp     ecx, [esp + $best_repeat]
        jle     SHORT try_rp3

; do
;    enc_matchpos_table[++best_repeated_offset] = 1
; while (best_repeated_offset < i)
        mov     edi, [esp + $best_repeat]
        mov     ebx, [esp + $matchpos_table]

rp2_copy:
        inc     edi                             ; ++best_repeated_offset
        mov     DWORD PTR [ebx + edi*4], 1
        cmp     edi, ecx                        ; best_repeated_offset < i ?
        jl      SHORT rp2_copy

; best_repeat = i
        mov     [esp + $best_repeat], ecx

;
; THIRD REPEATED OFFSET
;
try_rp3:

; for (i = 0; i < match_length; i++)
;   compare bufpos+i vs. bufpos+i-enc_last_matchpos_offset[2]
        mov     edi, [esp + $mem_window]

; ebx = bufpos
        mov     ebx, esi

; repeated offset two
; ebx = bufpos - repeated_offset[2]
        mov     ecx, [esp + $context]
        sub     ebx, [ecx + OFF_LAST_MATCHPOS_OFFSET + 8]

; i = 0
        xor     ecx, ecx

rp3_loop:
        mov     dl, [edi + esi]
        cmp     dl, [edi + ebx]
        jne     SHORT rp3_mismatch

; i++
        inc     ecx

; inc window pointer
        inc     edi

; i < match_length?
        cmp     ecx, eax
        jl      SHORT rp3_loop

;
; i == match_length
;
; therefore force ourselves to take rp3
;
; (this code is not in the C source, since it is
; messy to do)
;
        mov     ebx, [esp + $matchpos_table]

; enc_matchpos_table[match_length .. MIN_MATCH] = 2
force_rp3_copy:
        mov     DWORD PTR [ebx + ecx*4], 2
        dec     ecx
        cmp     ecx, MIN_MATCH
        jge     SHORT force_rp3_copy
        jmp     SHORT boundary_check

rp3_mismatch:

; if (i > best_repeated_offset)
        cmp     ecx, [esp + $best_repeat]
        jle     SHORT boundary_check

; do
;    enc_matchpos_table[++best_repeated_offset] = 2
; while (best_repeated_offset < i)
        mov     edi, [esp + $best_repeat]
        mov     ebx, [esp + $matchpos_table]

rp3_copy:
        inc     edi                             ; ++best_repeated_offset
        mov     DWORD PTR [ebx + edi*4], 2
        cmp     edi, ecx                        ; best_repeated_offset < i ?
        jl      SHORT rp3_copy

;
; Check that our match length does not cause us
; to cross a 32K boundary, and truncate if necessary.
;

; bytes_to_boundary = 32767 - (BufPos & 32767)
boundary_check:
        mov     edx, 32767
        and     esi, 32767                      ; esi (BufPos) is not needed after this point
        mov     eax, [esp + $match_length]
        sub     edx, esi                        ; edx = 32767 - (BufPos & 32767)

;
; if (matchlength <= bytes_to_boundary)
;   then we're ok
;
        cmp     eax, edx
        jle     SHORT does_not_cross

;
; otherwise we have to truncate the match
;
        mov     eax, edx

;
; if we truncate the match, does it become
; smaller than MIN_MATCH?
;
        cmp     edx, MIN_MATCH
        jge     SHORT ge_min_match

;
; yes, so we return that no matches at all
; were found
;
        xor     eax, eax

ge_min_match:
does_not_cross:

;
; return our match length in eax
;
cleanup:
        add     esp, LOCAL_STACK                ; release locals

        pop     ebp                             ; restore in reverse push order
        pop     edi
        pop     esi
        pop     edx
        pop     ecx
        pop     ebx

        ret     0

;
; ptr <= endpos
;
; Reached before ebp is reused for big_len, so ebp still holds context.
;
ptr_le_endpos:

;
; left[BufPos] = right[BufPos] = 0
;
        xor     eax, eax                        ; return match length zero

        mov     ecx, [ebp + OFF_LEFT]
        mov     edx, [ebp + OFF_RIGHT]

        mov     [ecx + esi*4], eax
        mov     [edx + esi*4], eax

; cleanup
        add     esp, LOCAL_STACK

        pop     ebp
        pop     edi
        pop     esi
        pop     edx
        pop     ecx
        pop     ebx

        ret     0

_binary_search_findmatch ENDP

_TEXT   ENDS

END