Fossil SCM

fossil-scm / compat / zlib / contrib / gcc_gvmat64 / gvmat64.S
Source Blame History 570 lines
7ef7284… drh 1 /*
7ef7284… drh 2 ;uInt longest_match_x64(
7ef7284… drh 3 ; deflate_state *s,
7ef7284… drh 4 ; IPos cur_match); // current match
7ef7284… drh 5
7ef7284… drh 6 ; gvmat64.S -- Asm portion of the optimized longest_match for 32 bits x86_64
7ef7284… drh 7 ; (AMD64 on Athlon 64, Opteron, Phenom
7ef7284… drh 8 ; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
7ef7284… drh 9 ; this file is a translation of gvmat64.asm for GCC 4.x (Linux, Mac Xcode)
7ef7284… drh 10 ; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
7ef7284… drh 11 ;
7ef7284… drh 12 ; File written by Gilles Vollant, by converting to assembly the longest_match
7ef7284… drh 13 ; from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
7ef7284… drh 14 ; and by taking inspiration on asm686 with masm, optimised assembly code
7ef7284… drh 15 ; from Brian Raiter, written 1998
7ef7284… drh 16 ;
7ef7284… drh 17 ; This software is provided 'as-is', without any express or implied
7ef7284… drh 18 ; warranty. In no event will the authors be held liable for any damages
7ef7284… drh 19 ; arising from the use of this software.
7ef7284… drh 20 ;
7ef7284… drh 21 ; Permission is granted to anyone to use this software for any purpose,
7ef7284… drh 22 ; including commercial applications, and to alter it and redistribute it
7ef7284… drh 23 ; freely, subject to the following restrictions:
7ef7284… drh 24 ;
7ef7284… drh 25 ; 1. The origin of this software must not be misrepresented; you must not
7ef7284… drh 26 ; claim that you wrote the original software. If you use this software
7ef7284… drh 27 ; in a product, an acknowledgment in the product documentation would be
7ef7284… drh 28 ; appreciated but is not required.
7ef7284… drh 29 ; 2. Altered source versions must be plainly marked as such, and must not be
7ef7284… drh 30 ; misrepresented as being the original software
7ef7284… drh 31 ; 3. This notice may not be removed or altered from any source distribution.
7ef7284… drh 32 ;
6ea30fb… florian 33 ; https://www.zlib.net
6ea30fb… florian 34 ; https://www.muppetlabs.com/~breadbox/software/assembly.html
7ef7284… drh 35 ;
7ef7284… drh 36 ; to compile this file for zLib, I use option:
7ef7284… drh 37 ; gcc -c -arch x86_64 gvmat64.S
7ef7284… drh 38
7ef7284… drh 39
7ef7284… drh 40 ;uInt longest_match(s, cur_match)
7ef7284… drh 41 ; deflate_state *s;
7ef7284… drh 42 ; IPos cur_match; // current match /
7ef7284… drh 43 ;
7ef7284… drh 44 ; with Xcode for Mac, I got strange errors on some jumps in Intel syntax;
7ef7284… drh 45 ; this is why BEFORE_JMP and AFTER_JMP are used
7ef7284… drh 46 */
7ef7284… drh 47
7ef7284… drh 48
7ef7284… drh 49 #define BEFORE_JMP .att_syntax //; placed before a jump: switch the assembler to AT&T syntax
7ef7284… drh 50 #define AFTER_JMP .intel_syntax noprefix //; placed after a jump: switch back to Intel syntax
7ef7284… drh 51
7ef7284… drh 52 #ifndef NO_UNDERLINE //; unless NO_UNDERLINE is defined, both symbols get a leading underscore
7ef7284… drh 53 # define match_init _match_init
7ef7284… drh 54 # define longest_match _longest_match
7ef7284… drh 55 #endif
7ef7284… drh 56
7ef7284… drh 57 .intel_syntax noprefix
7ef7284… drh 58
7ef7284… drh 59 .globl match_init, longest_match
7ef7284… drh 60 .text
7ef7284… drh 61 longest_match: //; uInt longest_match(deflate_state *s, IPos cur_match); result is returned in eax
7ef7284… drh 62
7ef7284… drh 63
7ef7284… drh 64
7ef7284… drh 65 #define LocalVarsSize 96 //; every slot below is addressed as (rsp + k - 96), i.e. at a negative offset from rsp
7ef7284… drh 66 /*
7ef7284… drh 67 ; registers used : rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12,r13
7ef7284… drh 68 ; free registers : r14,r15 (saved and restored, but otherwise unused)
7ef7284… drh 69 ; register can be saved : rsp
7ef7284… drh 70 ; NOTE(review): nothing in this file moves rsp, so the slots below live under
7ef7284… drh 70 ; the stack pointer - presumably relying on the System V x86-64 red zone;
7ef7284… drh 70 ; confirm for the target ABI.
7ef7284… drh 70 */
7ef7284… drh 71
7ef7284… drh 72 #define chainlenwmask (rsp + 8 - LocalVarsSize) //; 32-bit local: (chainlen << 16) | wmask
7ef7284… drh 73 #define nicematch (rsp + 16 - LocalVarsSize) //; 32-bit local: smaller of nice_match and lookahead
7ef7284… drh 74
7ef7284… drh 75 #define save_rdi (rsp + 24 - LocalVarsSize) //; only referenced by commented-out code in this file
7ef7284… drh 76 #define save_rsi (rsp + 32 - LocalVarsSize) //; only referenced by commented-out code in this file
7ef7284… drh 77 #define save_rbx (rsp + 40 - LocalVarsSize) //; save slots for the callee-saved registers
7ef7284… drh 78 #define save_rbp (rsp + 48 - LocalVarsSize)
7ef7284… drh 79 #define save_r12 (rsp + 56 - LocalVarsSize)
7ef7284… drh 80 #define save_r13 (rsp + 64 - LocalVarsSize)
7ef7284… drh 81 #define save_r14 (rsp + 72 - LocalVarsSize)
7ef7284… drh 82 #define save_r15 (rsp + 80 - LocalVarsSize)
7ef7284… drh 83
7ef7284… drh 84
7ef7284… drh 85 /*
7ef7284… drh 86 ; all the +4 offsets are due to the addition of pending_buf_size (in zlib)
7ef7284… drh 87 ; to the deflate_state structure since the asm code was first written
7ef7284… drh 88 ; (if you compile with zlib 1.0.4 or older, remove the +4).
7ef7284… drh 89 ; Note: these values are valid for a structure packed on 8-byte boundaries
7ef7284… drh 90 */
7ef7284… drh 91
7ef7284… drh 92 #define MAX_MATCH 258 //; match lengths are capped at this value (see LenMaximum)
7ef7284… drh 93 #define MIN_MATCH 3
7ef7284… drh 94 #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) //; 258 + 3 + 1 = 262; subtracted from w_size when computing limit
7ef7284… drh 95
7ef7284… drh 96 /*
7ef7284… drh 97 ;;; Offsets for fields in the deflate_state structure. These numbers
7ef7284… drh 98 ;;; are calculated from the definition of deflate_state, with the
7ef7284… drh 99 ;;; assumption that the compiler will dword-align the fields. (Thus,
7ef7284… drh 100 ;;; changing the definition of deflate_state could easily cause this
7ef7284… drh 101 ;;; program to crash horribly, without so much as a warning at
7ef7284… drh 102 ;;; compile time. Sigh.)
7ef7284… drh 103
7ef7284… drh 104 ; all the +zlib1222add offsets are due to the addition of fields
7ef7284… drh 105 ; in zlib in the deflate_state structure since the asm code was first written
7ef7284… drh 106 ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
7ef7284… drh 107 ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
7ef7284… drh 108 ; (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
7ef7284… drh 109 */
7ef7284… drh 110
7ef7284… drh 111
7ef7284… drh 112
7ef7284… drh 113 /* you can check the structure offset by running
7ef7284… drh 114
7ef7284… drh 115 #include <stdlib.h>
7ef7284… drh 116 #include <stdio.h>
7ef7284… drh 117 #include "deflate.h"
7ef7284… drh 118
7ef7284… drh 119 void print_depl()
7ef7284… drh 120 {
7ef7284… drh 121 deflate_state ds;
7ef7284… drh 122 deflate_state *s=&ds;
7ef7284… drh 123 printf("size pointer=%u\n",(int)sizeof(void*));
7ef7284… drh 124
7ef7284… drh 125 printf("#define dsWSize %u\n",(int)(((char*)&(s->w_size))-((char*)s)));
7ef7284… drh 126 printf("#define dsWMask %u\n",(int)(((char*)&(s->w_mask))-((char*)s)));
7ef7284… drh 127 printf("#define dsWindow %u\n",(int)(((char*)&(s->window))-((char*)s)));
7ef7284… drh 128 printf("#define dsPrev %u\n",(int)(((char*)&(s->prev))-((char*)s)));
7ef7284… drh 129 printf("#define dsMatchLen %u\n",(int)(((char*)&(s->match_length))-((char*)s)));
7ef7284… drh 130 printf("#define dsPrevMatch %u\n",(int)(((char*)&(s->prev_match))-((char*)s)));
7ef7284… drh 131 printf("#define dsStrStart %u\n",(int)(((char*)&(s->strstart))-((char*)s)));
7ef7284… drh 132 printf("#define dsMatchStart %u\n",(int)(((char*)&(s->match_start))-((char*)s)));
7ef7284… drh 133 printf("#define dsLookahead %u\n",(int)(((char*)&(s->lookahead))-((char*)s)));
7ef7284… drh 134 printf("#define dsPrevLen %u\n",(int)(((char*)&(s->prev_length))-((char*)s)));
7ef7284… drh 135 printf("#define dsMaxChainLen %u\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
7ef7284… drh 136 printf("#define dsGoodMatch %u\n",(int)(((char*)&(s->good_match))-((char*)s)));
7ef7284… drh 137 printf("#define dsNiceMatch %u\n",(int)(((char*)&(s->nice_match))-((char*)s)));
7ef7284… drh 138 }
7ef7284… drh 139 */
7ef7284… drh 140
7ef7284… drh 141 #define dsWSize 68 //; byte offsets of deflate_state fields; hard-coded literals, so they must match the deflate.h in use
7ef7284… drh 142 #define dsWMask 76
7ef7284… drh 143 #define dsWindow 80
7ef7284… drh 144 #define dsPrev 96
7ef7284… drh 145 #define dsMatchLen 144 //; defined but not used by any accessor macro below
7ef7284… drh 146 #define dsPrevMatch 148 //; defined but not used by any accessor macro below
7ef7284… drh 147 #define dsStrStart 156
7ef7284… drh 148 #define dsMatchStart 160
7ef7284… drh 149 #define dsLookahead 164
7ef7284… drh 150 #define dsPrevLen 168
7ef7284… drh 151 #define dsMaxChainLen 172
7ef7284… drh 152 #define dsGoodMatch 188
7ef7284… drh 153 #define dsNiceMatch 192
7ef7284… drh 154
7ef7284… drh 155 #define window_size [ rcx + dsWSize] //; every accessor below is a memory operand based on rcx (the deflate_state pointer)
7ef7284… drh 156 #define WMask [ rcx + dsWMask]
7ef7284… drh 157 #define window_ad [ rcx + dsWindow]
7ef7284… drh 158 #define prev_ad [ rcx + dsPrev]
7ef7284… drh 159 #define strstart [ rcx + dsStrStart]
7ef7284… drh 160 #define match_start [ rcx + dsMatchStart]
7ef7284… drh 161 #define Lookahead [ rcx + dsLookahead] //; 0ffffffffh on infozip
7ef7284… drh 162 #define prev_length [ rcx + dsPrevLen]
7ef7284… drh 163 #define max_chain_length [ rcx + dsMaxChainLen]
7ef7284… drh 164 #define good_match [ rcx + dsGoodMatch]
7ef7284… drh 165 #define nice_match [ rcx + dsNiceMatch]
7ef7284… drh 166
7ef7284… drh 167 /*
7ef7284… drh 168 ; windows:
7ef7284… drh 169 ; parameter 1 in rcx(deflate state s), param 2 in rdx (cur match)
7ef7284… drh 170
7ef7284… drh 171 ; All registers must be preserved across the call, except for
7ef7284… drh 172 ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
7ef7284… drh 173
7ef7284… drh 174 ;
7ef7284… drh 175 ; gcc on macosx-linux:
6ea30fb… florian 176 ; see https://refspecs.linuxbase.org/elf/x86_64-abi-0.99.pdf
7ef7284… drh 177 ; param 1 in rdi, param 2 in rsi
7ef7284… drh 178 ; rbx, rsp, rbp, r12 to r15 must be preserved
7ef7284… drh 179
7ef7284… drh 180 ;;; Save registers that the compiler may be using. Note that this version
7ef7284… drh 181 ;;; does not adjust rsp: the save slots are addressed at negative offsets from it.
7ef7284… drh 182
7ef7284… drh 183
7ef7284… drh 184 ;;; Retrieve the function arguments. r8d will hold cur_match
7ef7284… drh 185 ;;; throughout the entire function. rcx will hold the pointer to the
7ef7284… drh 186 ;;; deflate_state structure, both during the function's setup and
7ef7284… drh 187 ;;; inside the main loop.
7ef7284… drh 188
7ef7284… drh 189 ; ms: parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
7ef7284… drh 190 ; mac: param 1 in rdi, param 2 rsi
7ef7284… drh 191 ; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
7ef7284… drh 192 */
7ef7284… drh 193 mov [save_rbx],rbx //; spill callee-saved rbx and rbp into their slots below rsp
7ef7284… drh 194 mov [save_rbp],rbp
7ef7284… drh 195
7ef7284… drh 196
7ef7284… drh 197 mov rcx,rdi //; rcx = s (first argument); base register of the field-accessor macros
7ef7284… drh 198
7ef7284… drh 199 mov r8d,esi //; r8d = cur_match (second argument); the 32-bit move zeroes the upper half of r8
7ef7284… drh 200
7ef7284… drh 201
7ef7284… drh 202 mov [save_r12],r12 //; spill the remaining callee-saved registers
7ef7284… drh 203 mov [save_r13],r13
7ef7284… drh 204 mov [save_r14],r14
7ef7284… drh 205 mov [save_r15],r15
7ef7284… drh 206
7ef7284… drh 207
7ef7284… drh 208 //;;; uInt wmask = s->w_mask;
7ef7284… drh 209 //;;; unsigned chain_length = s->max_chain_length;
7ef7284… drh 210 //;;; if (s->prev_length >= s->good_match) {
7ef7284… drh 211 //;;; chain_length >>= 2;
7ef7284… drh 212 //;;; }
7ef7284… drh 213
7ef7284… drh 214
7ef7284… drh 215 mov edi, prev_length
7ef7284… drh 216 mov esi, good_match
7ef7284… drh 217 mov eax, WMask
7ef7284… drh 218 mov ebx, max_chain_length
7ef7284… drh 219 cmp edi, esi
7ef7284… drh 220 jl LastMatchGood //; signed compare: skip the shift when prev_length < good_match
7ef7284… drh 221 shr ebx, 2 //; chain_length >>= 2
7ef7284… drh 222 LastMatchGood:
7ef7284… drh 223
7ef7284… drh 224 //;;; chainlen is decremented once beforehand so that the function can
7ef7284… drh 225 //;;; use the sign flag instead of the zero flag for the exit test.
7ef7284… drh 226 //;;; It is then shifted into the high word, to make room for the wmask
7ef7284… drh 227 //;;; value, which it will always accompany.
7ef7284… drh 228
7ef7284… drh 229 dec ebx
7ef7284… drh 230 shl ebx, 16
7ef7284… drh 231 or ebx, eax //; ebx = ((chain_length - 1) << 16) | wmask
7ef7284… drh 232
7ef7284… drh 233 //;;; on zlib only
7ef7284… drh 234 //;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
7ef7284… drh 235
7ef7284… drh 236
7ef7284… drh 237
7ef7284… drh 238 mov eax, nice_match
7ef7284… drh 239 mov [chainlenwmask], ebx //; park the packed value in its local slot
7ef7284… drh 240 mov r10d, Lookahead
7ef7284… drh 241 cmp r10d, eax
7ef7284… drh 242 cmovnl r10d, eax //; if lookahead >= nice_match (signed) take nice_match, so r10d is the smaller of the two
7ef7284… drh 243 mov [nicematch],r10d
7ef7284… drh 244
7ef7284… drh 245
7ef7284… drh 246
7ef7284… drh 247 //;;; register Bytef *scan = s->window + s->strstart;
7ef7284… drh 248 mov r10, window_ad //; r10 = s->window (not written again after this point)
7ef7284… drh 249 mov ebp, strstart
7ef7284… drh 250 lea r13, [r10 + rbp] //; r13 = scan
7ef7284… drh 251
7ef7284… drh 252 //;;; Determine how many bytes the scan ptr is off from being
7ef7284… drh 253 //;;; dword-aligned.
7ef7284… drh 254
7ef7284… drh 255 mov r9,r13 //; r9 = scan (not written again after this point)
7ef7284… drh 256 neg r13
7ef7284… drh 257 and r13,3 //; r13 = (-scan) & 3
7ef7284… drh 258
7ef7284… drh 259 //;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
7ef7284… drh 260 //;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
7ef7284… drh 261
7ef7284… drh 262
7ef7284… drh 263 mov eax, window_size
7ef7284… drh 264 sub eax, MIN_LOOKAHEAD //; eax = w_size - MIN_LOOKAHEAD
7ef7284… drh 265
7ef7284… drh 266
7ef7284… drh 267 xor edi,edi
7ef7284… drh 268 sub ebp, eax //; ebp = strstart - eax; the flags of this sub are consumed by the cmovng below
7ef7284… drh 269
7ef7284… drh 270 mov r11d, prev_length //; r11d = best_len (mov leaves the flags untouched)
7ef7284… drh 271
7ef7284… drh 272 cmovng ebp,edi //; limit = 0 when the subtraction result was <= 0 (signed)
7ef7284… drh 273
7ef7284… drh 274 //;;; int best_len = s->prev_length;
7ef7284… drh 275
7ef7284… drh 276
7ef7284… drh 277 //;;; Keep the sum of s->window + best_len in rsi.
7ef7284… drh 278
7ef7284… drh 279 lea rsi,[r10+r11]
7ef7284… drh 280
7ef7284… drh 281 //;;; register ush scan_start = *(ushf*)scan;
7ef7284… drh 282 //;;; register ush scan_end = *(ushf*)(scan+best_len-1);
7ef7284… drh 283 //;;; Posf *prev = s->prev;
7ef7284… drh 284
7ef7284… drh 285 movzx r12d,word ptr [r9] //; r12w = scan_start
7ef7284… drh 286 movzx ebx, word ptr [r9 + r11 - 1] //; bx = scan_end
7ef7284… drh 287
7ef7284… drh 288 mov rdi, prev_ad //; rdi = s->prev
7ef7284… drh 289
7ef7284… drh 290 //;;; Jump into the main loop.
7ef7284… drh 291
7ef7284… drh 292 mov edx, [chainlenwmask]
7ef7284… drh 293
7ef7284… drh 294 cmp bx,word ptr [rsi + r8 - 1] //; scan_end against the word at window + cur_match + best_len - 1
7ef7284… drh 295 jz LookupLoopIsZero //; on a mismatch, fall through into LookupLoop1
7ef7284… drh 296
7ef7284… drh 297
7ef7284… drh 298
7ef7284… drh 299 LookupLoop1: //; first of three back-to-back copies of the chain step (LookupLoop1, LookupLoop2, LookupLoop4)
7ef7284… drh 300 and r8d, edx //; r8d &= edx, where edx = (chainlen << 16) | wmask
7ef7284… drh 301
7ef7284… drh 302 movzx r8d, word ptr [rdi + r8*2] //; cur_match = 16-bit entry loaded from the prev table
7ef7284… drh 303 cmp r8d, ebp
7ef7284… drh 304 jbe LeaveNow //; stop when cur_match <= limit (unsigned)
7ef7284… drh 305
7ef7284… drh 306
7ef7284… drh 307
7ef7284… drh 308 sub edx, 0x00010000 //; decrement chainlen, which lives above bit 15 of edx
7ef7284… drh 309 BEFORE_JMP
7ef7284… drh 310 js LeaveNow //; stop once the counter has gone negative
7ef7284… drh 311 AFTER_JMP
7ef7284… drh 312
7ef7284… drh 313 LoopEntry1:
7ef7284… drh 314 cmp bx,word ptr [rsi + r8 - 1] //; scan_end against the candidate at the same offset
7ef7284… drh 315 BEFORE_JMP
7ef7284… drh 316 jz LookupLoopIsZero
7ef7284… drh 317 AFTER_JMP
7ef7284… drh 318
7ef7284… drh 319 LookupLoop2: //; second copy; reached only by fall-through in this file
7ef7284… drh 320 and r8d, edx
7ef7284… drh 321
7ef7284… drh 322 movzx r8d, word ptr [rdi + r8*2]
7ef7284… drh 323 cmp r8d, ebp
7ef7284… drh 324 BEFORE_JMP
7ef7284… drh 325 jbe LeaveNow
7ef7284… drh 326 AFTER_JMP
7ef7284… drh 327 sub edx, 0x00010000
7ef7284… drh 328 BEFORE_JMP
7ef7284… drh 329 js LeaveNow
7ef7284… drh 330 AFTER_JMP
7ef7284… drh 331
7ef7284… drh 332 LoopEntry2:
7ef7284… drh 333 cmp bx,word ptr [rsi + r8 - 1]
7ef7284… drh 334 BEFORE_JMP
7ef7284… drh 335 jz LookupLoopIsZero
7ef7284… drh 336 AFTER_JMP
7ef7284… drh 337
7ef7284… drh 338 LookupLoop4: //; third copy; reached only by fall-through in this file
7ef7284… drh 339 and r8d, edx
7ef7284… drh 340
7ef7284… drh 341 movzx r8d, word ptr [rdi + r8*2]
7ef7284… drh 342 cmp r8d, ebp
7ef7284… drh 343 BEFORE_JMP
7ef7284… drh 344 jbe LeaveNow
7ef7284… drh 345 AFTER_JMP
7ef7284… drh 346 sub edx, 0x00010000
7ef7284… drh 347 BEFORE_JMP
7ef7284… drh 348 js LeaveNow
7ef7284… drh 349 AFTER_JMP
7ef7284… drh 350
7ef7284… drh 351 LoopEntry4:
7ef7284… drh 352
7ef7284… drh 353 cmp bx,word ptr [rsi + r8 - 1]
7ef7284… drh 354 BEFORE_JMP
7ef7284… drh 355 jnz LookupLoop1 //; scan_end mismatch: start the three-copy sequence again
7ef7284… drh 356 jmp LookupLoopIsZero //; scan_end matched: go compare scan_start
7ef7284… drh 357 AFTER_JMP
7ef7284… drh 358 /*
7ef7284… drh 359 ;;; do {
7ef7284… drh 360 ;;; match = s->window + cur_match;
7ef7284… drh 361 ;;; if (*(ushf*)(match+best_len-1) != scan_end ||
7ef7284… drh 362 ;;; *(ushf*)match != scan_start) continue;
7ef7284… drh 363 ;;; [...]
7ef7284… drh 364 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
7ef7284… drh 365 ;;; && --chain_length != 0);
7ef7284… drh 366 ;;;
7ef7284… drh 367 ;;; Here is the inner loop of the function. The function will spend the
7ef7284… drh 368 ;;; majority of its time in this loop, and majority of that time will
7ef7284… drh 369 ;;; be spent in the first ten instructions.
7ef7284… drh 370 ;;;
7ef7284… drh 371 ;;; Within this loop:
7ef7284… drh 372 ;;; ebx = scanend
7ef7284… drh 373 ;;; r8d = curmatch
7ef7284… drh 374 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
7ef7284… drh 375 ;;; rsi = windowbestlen - i.e., (window + bestlen)
7ef7284… drh 376 ;;; rdi = prev
7ef7284… drh 377 ;;; ebp = limit (also live: r9 = scan, r10 = window, r11d = bestlen, r12w = scanstart)
7ef7284… drh 378 */
7ef7284… drh 379 .balign 16
7ef7284… drh 380 LookupLoop: //; re-entry point used after a candidate has been fully compared
7ef7284… drh 381 and r8d, edx
7ef7284… drh 382
7ef7284… drh 383 movzx r8d, word ptr [rdi + r8*2] //; cur_match = 16-bit entry loaded from the prev table
7ef7284… drh 384 cmp r8d, ebp
7ef7284… drh 385 BEFORE_JMP
7ef7284… drh 386 jbe LeaveNow //; stop when cur_match <= limit (unsigned)
7ef7284… drh 387 AFTER_JMP
7ef7284… drh 388 sub edx, 0x00010000 //; decrement chainlen, which lives above bit 15 of edx
7ef7284… drh 389 BEFORE_JMP
7ef7284… drh 390 js LeaveNow
7ef7284… drh 391 AFTER_JMP
7ef7284… drh 392
7ef7284… drh 393 LoopEntry:
7ef7284… drh 394
7ef7284… drh 395 cmp bx,word ptr [rsi + r8 - 1]
7ef7284… drh 396 BEFORE_JMP
7ef7284… drh 397 jnz LookupLoop1 //; scan_end mismatch: continue in the unrolled copies
7ef7284… drh 398 AFTER_JMP
7ef7284… drh 399 LookupLoopIsZero: //; scan_end matched; now check the first two bytes
7ef7284… drh 400 cmp r12w, word ptr [r10 + r8] //; scan_start against the word at window + cur_match
7ef7284… drh 401 BEFORE_JMP
7ef7284… drh 402 jnz LookupLoop1
7ef7284… drh 403 AFTER_JMP
7ef7284… drh 404
7ef7284… drh 405
7ef7284… drh 406 //;;; Store the current value of chainlen.
7ef7284… drh 407 mov [chainlenwmask], edx //; edx is about to be reused as the compare offset
7ef7284… drh 408 /*
7ef7284… drh 409 ;;; Point rdi to the string under scrutiny, and rsi to the string we
7ef7284… drh 410 ;;; are hoping to match it up with. In actuality, rsi and rdi are
7ef7284… drh 411 ;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and rdx is
7ef7284… drh 412 ;;; initialized to -(MAX_MATCH_8); scanalign is r13 = (-scan) & 3.
7ef7284… drh 413 */
7ef7284… drh 414 lea rsi,[r8+r10] //; rsi = window + cur_match
7ef7284… drh 415 mov rdx, 0xfffffffffffffef8 //; -(MAX_MATCH_8)
7ef7284… drh 416 lea rsi, [rsi + r13 + 0x0108] //;MAX_MATCH_8]
7ef7284… drh 417 lea rdi, [r9 + r13 + 0x0108] //;MAX_MATCH_8]
7ef7284… drh 418
7ef7284… drh 419 prefetcht1 [rsi+rdx]
7ef7284… drh 420 prefetcht1 [rdi+rdx]
7ef7284… drh 421
7ef7284… drh 422 /*
7ef7284… drh 423 ;;; Test the strings for equality, 8 bytes at a time. At the end,
7ef7284… drh 424 ;;; adjust rdx so that it is offset to the exact byte that mismatched.
7ef7284… drh 425 ;;;
7ef7284… drh 426 ;;; We already know at this point that the first three bytes of the
7ef7284… drh 427 ;;; strings match each other, and they can be safely passed over before
7ef7284… drh 428 ;;; starting the compare loop. So what this code does is skip over 0-3
7ef7284… drh 429 ;;; bytes, as much as necessary in order to dword-align the rdi
7ef7284… drh 430 ;;; pointer. (rsi will still be misaligned three times out of four.)
7ef7284… drh 431 ;;;
7ef7284… drh 432 ;;; It should be confessed that this loop usually does not represent
7ef7284… drh 433 ;;; much of the total running time. Replacing it with a more
7ef7284… drh 434 ;;; straightforward "rep cmpsb" would not drastically degrade
7ef7284… drh 435 ;;; performance.
7ef7284… drh 436 */
7ef7284… drh 437
7ef7284… drh 438 LoopCmps: //; each pass compares three 8-byte groups (24 bytes)
7ef7284… drh 439 mov rax, [rsi + rdx]
7ef7284… drh 440 xor rax, [rdi + rdx] //; nonzero result means the two groups differ
7ef7284… drh 441 jnz LeaveLoopCmps
7ef7284… drh 442
7ef7284… drh 443 mov rax, [rsi + rdx + 8]
7ef7284… drh 444 xor rax, [rdi + rdx + 8]
7ef7284… drh 445 jnz LeaveLoopCmps8
7ef7284… drh 446
7ef7284… drh 447
7ef7284… drh 448 mov rax, [rsi + rdx + 8+8]
7ef7284… drh 449 xor rax, [rdi + rdx + 8+8]
7ef7284… drh 450 jnz LeaveLoopCmps16
7ef7284… drh 451
7ef7284… drh 452 add rdx,8+8+8 //; rdx climbs from -0x108 (264 = 11 * 24) toward zero
7ef7284… drh 453
7ef7284… drh 454 BEFORE_JMP
7ef7284… drh 455 jnz LoopCmps //; tests the flags of the add: loop until rdx reaches zero
7ef7284… drh 456 jmp LenMaximum //; rdx reached zero without any mismatch
7ef7284… drh 457 AFTER_JMP
7ef7284… drh 458
7ef7284… drh 459 LeaveLoopCmps16: add rdx,8 //; mismatch was in the third 8-byte group: advance rdx by 16 in total
7ef7284… drh 460 LeaveLoopCmps8: add rdx,8 //; mismatch was in the second 8-byte group (also reached by fall-through)
7ef7284… drh 461 LeaveLoopCmps: //; rax = XOR of the two differing 8-byte groups; rdx = offset of that group
7ef7284… drh 462
7ef7284… drh 463 test eax, 0x0000FFFF
7ef7284… drh 464 jnz LenLower //; difference lies within bytes 0-1
7ef7284… drh 465
7ef7284… drh 466 test eax,0xffffffff
7ef7284… drh 467
7ef7284… drh 468 jnz LenLower32 //; difference lies within bytes 2-3
7ef7284… drh 469
7ef7284… drh 470 add rdx,4
7ef7284… drh 471 shr rax,32 //; bring the upper four bytes down into eax
7ef7284… drh 472 or ax,ax
7ef7284… drh 473 BEFORE_JMP
7ef7284… drh 474 jnz LenLower //; difference lies within bytes 4-5
7ef7284… drh 475 AFTER_JMP
7ef7284… drh 476
7ef7284… drh 477 LenLower32: //; also reached by fall-through when the difference lies within bytes 6-7
7ef7284… drh 478 shr eax,16
7ef7284… drh 479 add rdx,2
7ef7284… drh 480
7ef7284… drh 481 LenLower:
7ef7284… drh 482 sub al, 1 //; carry is set only when al was 0, i.e. the lower byte of the pair matched
7ef7284… drh 483 adc rdx, 0 //; in that case the first differing byte is one further on
7ef7284… drh 484 //;;; Calculate the length of the match. If it is MAX_MATCH or longer,
7ef7284… drh 485 //;;; then automatically accept it as the best possible match and leave.
7ef7284… drh 486
7ef7284… drh 487 lea rax, [rdi + rdx]
7ef7284… drh 488 sub rax, r9 //; rax = (address of first differing byte in the scan string) - scan = len
7ef7284… drh 489 cmp eax, MAX_MATCH
7ef7284… drh 490 BEFORE_JMP
7ef7284… drh 491 jge LenMaximum
7ef7284… drh 492 AFTER_JMP
7ef7284… drh 493 /*
7ef7284… drh 494 ;;; If the length of the match is not longer than the best match we
7ef7284… drh 495 ;;; have so far, then forget it and return to the lookup loop.
7ef7284… drh 496 ;///////////////////////////////////
7ef7284… drh 497 */
7ef7284… drh 498 cmp eax, r11d //; len against best_len
7ef7284… drh 499 jg LongerMatch //; signed compare
7ef7284… drh 500
7ef7284… drh 501 lea rsi,[r10+r11] //; rebuild the lookup-loop registers that the compare loop overwrote
7ef7284… drh 502
7ef7284… drh 503 mov rdi, prev_ad
7ef7284… drh 504 mov edx, [chainlenwmask]
7ef7284… drh 505 BEFORE_JMP
7ef7284… drh 506 jmp LookupLoop
7ef7284… drh 507 AFTER_JMP
7ef7284… drh 508 /*
7ef7284… drh 509 ;;; s->match_start = cur_match;
7ef7284… drh 510 ;;; best_len = len;
7ef7284… drh 511 ;;; if (len >= nice_match) break;
7ef7284… drh 512 ;;; scan_end = *(ushf*)(scan+best_len-1);
7ef7284… drh 513 */
7ef7284… drh 514 LongerMatch:
7ef7284… drh 515 mov r11d, eax //; best_len = len
7ef7284… drh 516 mov match_start, r8d //; s->match_start = cur_match
7ef7284… drh 517 cmp eax, [nicematch]
7ef7284… drh 518 BEFORE_JMP
7ef7284… drh 519 jge LeaveNow //; good enough: stop searching
7ef7284… drh 520 AFTER_JMP
7ef7284… drh 521
7ef7284… drh 522 lea rsi,[r10+rax] //; rsi = window + new best_len
7ef7284… drh 523
7ef7284… drh 524 movzx ebx, word ptr [r9 + rax - 1] //; refresh scan_end for the new best_len
7ef7284… drh 525 mov rdi, prev_ad
7ef7284… drh 526 mov edx, [chainlenwmask]
7ef7284… drh 527 BEFORE_JMP
7ef7284… drh 528 jmp LookupLoop
7ef7284… drh 529 AFTER_JMP
7ef7284… drh 530
7ef7284… drh 531 //;;; Accept the current string, with the maximum possible length.
7ef7284… drh 532
7ef7284… drh 533 LenMaximum:
7ef7284… drh 534 mov r11d,MAX_MATCH //; best_len = MAX_MATCH
7ef7284… drh 535 mov match_start, r8d //; s->match_start = cur_match
7ef7284… drh 536
7ef7284… drh 537 //;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
7ef7284… drh 538 //;;; return s->lookahead;
7ef7284… drh 539
7ef7284… drh 540 LeaveNow: //; common exit; LenMaximum falls through into it
7ef7284… drh 541 mov eax, Lookahead
7ef7284… drh 542 cmp r11d, eax
7ef7284… drh 543 cmovng eax, r11d //; eax = best_len when best_len <= lookahead (signed), otherwise lookahead
7ef7284… drh 544
7ef7284… drh 545
7ef7284… drh 546
7ef7284… drh 547 //;;; Restore the stack and return from whence we came.
7ef7284… drh 548
7ef7284… drh 549
7ef7284… drh 550 // mov rsi,[save_rsi]
7ef7284… drh 551 // mov rdi,[save_rdi]
7ef7284… drh 552 mov rbx,[save_rbx] //; reload the callee-saved registers spilled on entry
7ef7284… drh 553 mov rbp,[save_rbp]
7ef7284… drh 554 mov r12,[save_r12]
7ef7284… drh 555 mov r13,[save_r13]
7ef7284… drh 556 mov r14,[save_r14]
7ef7284… drh 557 mov r15,[save_r15]
7ef7284… drh 558
7ef7284… drh 559
7ef7284… drh 560 ret 0 //; result is in eax
7ef7284… drh 561 //; Please don't remove this string!
7ef7284… drh 562 //; You can freely use gvmat64 in any free or commercial app,
7ef7284… drh 563 //; but it is far better not to remove the string from the binary!
7ef7284… drh 564 // db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
7ef7284… drh 565
7ef7284… drh 566
7ef7284… drh 567 match_init: //; exported alongside longest_match; does nothing but return
7ef7284… drh 568 ret 0
7ef7284… drh 569
7ef7284… drh 570

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button