Fossil SCM
Improved robot detection logic.
Commit
61a8b0ecadbbdeaf40667fe69dddb84ad4558f30e393075c5e8d1636d15cb969
Parent
92f2a04d3780cf2…
1 file changed: +9 -26
src/login.c: +9 -26
| --- src/login.c | ||
| +++ src/login.c | ||
| @@ -394,50 +394,33 @@ | ||
| 394 | 394 | cgi_replace_parameter(cookie, NULL); |
| 395 | 395 | cgi_replace_parameter("anon", NULL); |
| 396 | 396 | } |
| 397 | 397 | } |
| 398 | 398 | |
| 399 | -/* | |
| 400 | -** Return true if the prefix of zStr matches zPattern. Return false if | |
| 401 | -** they are different. | |
| 402 | -** | |
| 403 | -** A lowercase character in zPattern will match either upper or lower | |
| 404 | -** case in zStr. But an uppercase in zPattern will only match an | |
| 405 | -** uppercase in zStr. | |
| 406 | -*/ | |
| 407 | -static int prefix_match(const char *zPattern, const char *zStr){ | |
| 408 | - int i; | |
| 409 | - char c; | |
| 410 | - for(i=0; (c = zPattern[i])!=0; i++){ | |
| 411 | - if( zStr[i]!=c && fossil_tolower(zStr[i])!=c ) return 0; | |
| 412 | - } | |
| 413 | - return 1; | |
| 414 | -} | |
| 415 | - | |
| 416 | 399 | /* |
| 417 | 400 | ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent |
| 418 | 401 | ** is a manually operated browser or a bot. When in doubt, assume a bot. |
| 419 | 402 | ** Return true if we believe the agent is a real person. |
| 420 | 403 | */ |
| 421 | 404 | static int isHuman(const char *zAgent){ |
| 422 | - int i; | |
| 423 | 405 | if( zAgent==0 ) return 0; /* If no UserAgent, then probably a bot */ |
| 424 | - for(i=0; zAgent[i]; i++){ | |
| 425 | - if( prefix_match("bot", zAgent+i) ) return 0; | |
| 426 | - if( prefix_match("spider", zAgent+i) ) return 0; | |
| 427 | - if( prefix_match("crawl", zAgent+i) ) return 0; | |
| 428 | - /* If a URI appears in the User-Agent, it is probably a bot */ | |
| 429 | - if( strncmp("http", zAgent+i,4)==0 ) return 0; | |
| 430 | - } | |
| 406 | + if( strstr(zAgent, "bot")!=0 ) return 0; | |
| 407 | + if( strstr(zAgent, "spider")!=0 ) return 0; | |
| 408 | + if( strstr(zAgent, "crawl")!=0 ) return 0; | |
| 409 | + /* If a URI appears in the User-Agent, it is probably a bot */ | |
| 410 | + if( strstr(zAgent, "http")!=0 ) return 0; | |
| 431 | 411 | if( strncmp(zAgent, "Mozilla/", 8)==0 ){ |
| 432 | 412 | if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */ |
| 433 | 413 | |
| 414 | + /* Google AI Robot, maybe? */ | |
| 415 | + if( strstr(zAgent, "GoogleOther)")!=0 ) return 0; | |
| 416 | + | |
| 434 | 417 | /* 2016-05-30: A pernicious spider that likes to walk Fossil timelines has |
| 435 | 418 | ** been detected on the SQLite website. The spider changes its user-agent |
| 436 | 419 | ** string frequently, but it always seems to include the following text: |
| 437 | 420 | */ |
| 438 | - if( sqlite3_strglob("*Safari/537.36Mozilla/5.0*", zAgent)==0 ) return 0; | |
| 421 | + if( strstr(zAgent, "Safari/537.36Mozilla/5.0")!=0 ) return 0; | |
| 439 | 422 | |
| 440 | 423 | if( sqlite3_strglob("*Firefox/[1-9]*", zAgent)==0 ) return 1; |
| 441 | 424 | if( sqlite3_strglob("*Chrome/[1-9]*", zAgent)==0 ) return 1; |
| 442 | 425 | if( sqlite3_strglob("*(compatible;?MSIE?[1789]*", zAgent)==0 ) return 1; |
| 443 | 426 | if( sqlite3_strglob("*Trident/[1-9]*;?rv:[1-9]*", zAgent)==0 ){ |
| 444 | 427 |
| --- src/login.c | |
| +++ src/login.c | |
| @@ -394,50 +394,33 @@ | |
| 394 | cgi_replace_parameter(cookie, NULL); |
| 395 | cgi_replace_parameter("anon", NULL); |
| 396 | } |
| 397 | } |
| 398 | |
| 399 | /* |
| 400 | ** Return true if the prefix of zStr matches zPattern. Return false if |
| 401 | ** they are different. |
| 402 | ** |
| 403 | ** A lowercase character in zPattern will match either upper or lower |
| 404 | ** case in zStr. But an uppercase in zPattern will only match an |
| 405 | ** uppercase in zStr. |
| 406 | */ |
| 407 | static int prefix_match(const char *zPattern, const char *zStr){ |
| 408 | int i; |
| 409 | char c; |
| 410 | for(i=0; (c = zPattern[i])!=0; i++){ |
| 411 | if( zStr[i]!=c && fossil_tolower(zStr[i])!=c ) return 0; |
| 412 | } |
| 413 | return 1; |
| 414 | } |
| 415 | |
| 416 | /* |
| 417 | ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent |
| 418 | ** is a manually operated browser or a bot. When in doubt, assume a bot. |
| 419 | ** Return true if we believe the agent is a real person. |
| 420 | */ |
| 421 | static int isHuman(const char *zAgent){ |
| 422 | int i; |
| 423 | if( zAgent==0 ) return 0; /* If no UserAgent, then probably a bot */ |
| 424 | for(i=0; zAgent[i]; i++){ |
| 425 | if( prefix_match("bot", zAgent+i) ) return 0; |
| 426 | if( prefix_match("spider", zAgent+i) ) return 0; |
| 427 | if( prefix_match("crawl", zAgent+i) ) return 0; |
| 428 | /* If a URI appears in the User-Agent, it is probably a bot */ |
| 429 | if( strncmp("http", zAgent+i,4)==0 ) return 0; |
| 430 | } |
| 431 | if( strncmp(zAgent, "Mozilla/", 8)==0 ){ |
| 432 | if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */ |
| 433 | |
| 434 | /* 2016-05-30: A pernicious spider that likes to walk Fossil timelines has |
| 435 | ** been detected on the SQLite website. The spider changes its user-agent |
| 436 | ** string frequently, but it always seems to include the following text: |
| 437 | */ |
| 438 | if( sqlite3_strglob("*Safari/537.36Mozilla/5.0*", zAgent)==0 ) return 0; |
| 439 | |
| 440 | if( sqlite3_strglob("*Firefox/[1-9]*", zAgent)==0 ) return 1; |
| 441 | if( sqlite3_strglob("*Chrome/[1-9]*", zAgent)==0 ) return 1; |
| 442 | if( sqlite3_strglob("*(compatible;?MSIE?[1789]*", zAgent)==0 ) return 1; |
| 443 | if( sqlite3_strglob("*Trident/[1-9]*;?rv:[1-9]*", zAgent)==0 ){ |
| 444 |
| --- src/login.c | |
| +++ src/login.c | |
| @@ -394,50 +394,33 @@ | |
| 394 | cgi_replace_parameter(cookie, NULL); |
| 395 | cgi_replace_parameter("anon", NULL); |
| 396 | } |
| 397 | } |
| 398 | |
| 399 | /* |
| 400 | ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent |
| 401 | ** is a manually operated browser or a bot. When in doubt, assume a bot. |
| 402 | ** Return true if we believe the agent is a real person. |
| 403 | */ |
| 404 | static int isHuman(const char *zAgent){ |
| 405 | if( zAgent==0 ) return 0; /* If no UserAgent, then probably a bot */ |
| 406 | if( strstr(zAgent, "bot")!=0 ) return 0; |
| 407 | if( strstr(zAgent, "spider")!=0 ) return 0; |
| 408 | if( strstr(zAgent, "crawl")!=0 ) return 0; |
| 409 | /* If a URI appears in the User-Agent, it is probably a bot */ |
| 410 | if( strstr(zAgent, "http")!=0 ) return 0; |
| 411 | if( strncmp(zAgent, "Mozilla/", 8)==0 ){ |
| 412 | if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */ |
| 413 | |
| 414 | /* Google AI Robot, maybe? */ |
| 415 | if( strstr(zAgent, "GoogleOther)")!=0 ) return 0; |
| 416 | |
| 417 | /* 2016-05-30: A pernicious spider that likes to walk Fossil timelines has |
| 418 | ** been detected on the SQLite website. The spider changes its user-agent |
| 419 | ** string frequently, but it always seems to include the following text: |
| 420 | */ |
| 421 | if( strstr(zAgent, "Safari/537.36Mozilla/5.0")!=0 ) return 0; |
| 422 | |
| 423 | if( sqlite3_strglob("*Firefox/[1-9]*", zAgent)==0 ) return 1; |
| 424 | if( sqlite3_strglob("*Chrome/[1-9]*", zAgent)==0 ) return 1; |
| 425 | if( sqlite3_strglob("*(compatible;?MSIE?[1789]*", zAgent)==0 ) return 1; |
| 426 | if( sqlite3_strglob("*Trident/[1-9]*;?rv:[1-9]*", zAgent)==0 ){ |
| 427 |