Fossil SCM
Further refinement of the robot/human discriminator.
Commit
4fdb63d690183255bc3101cbea413289dc8416a0
Parent
82979bc21073197…
2 files changed
+1
-1
+27
-19
+1
-1
| --- src/http.c | ||
| +++ src/http.c | ||
| @@ -111,11 +111,11 @@ | ||
| 111 | 111 | fossil_free(zEncoded); |
| 112 | 112 | fossil_free(zCredentials); |
| 113 | 113 | } |
| 114 | 114 | blob_appendf(pHdr, "Host: %s\r\n", g.urlHostname); |
| 115 | 115 | blob_appendf(pHdr, "User-Agent: Fossil/" RELEASE_VERSION |
| 116 | - "-" MANIFEST_VERSION "\r\n"); | |
| 116 | + " (" MANIFEST_DATE " " MANIFEST_VERSION ")\r\n"); | |
| 117 | 117 | if( g.fHttpTrace ){ |
| 118 | 118 | blob_appendf(pHdr, "Content-Type: application/x-fossil-debug\r\n"); |
| 119 | 119 | }else{ |
| 120 | 120 | blob_appendf(pHdr, "Content-Type: application/x-fossil\r\n"); |
| 121 | 121 | } |
| 122 | 122 |
| --- src/http.c | |
| +++ src/http.c | |
| @@ -111,11 +111,11 @@ | |
| 111 | fossil_free(zEncoded); |
| 112 | fossil_free(zCredentials); |
| 113 | } |
| 114 | blob_appendf(pHdr, "Host: %s\r\n", g.urlHostname); |
| 115 | blob_appendf(pHdr, "User-Agent: Fossil/" RELEASE_VERSION |
| 116 | "-" MANIFEST_VERSION "\r\n"); |
| 117 | if( g.fHttpTrace ){ |
| 118 | blob_appendf(pHdr, "Content-Type: application/x-fossil-debug\r\n"); |
| 119 | }else{ |
| 120 | blob_appendf(pHdr, "Content-Type: application/x-fossil\r\n"); |
| 121 | } |
| 122 |
| --- src/http.c | |
| +++ src/http.c | |
| @@ -111,11 +111,11 @@ | |
| 111 | fossil_free(zEncoded); |
| 112 | fossil_free(zCredentials); |
| 113 | } |
| 114 | blob_appendf(pHdr, "Host: %s\r\n", g.urlHostname); |
| 115 | blob_appendf(pHdr, "User-Agent: Fossil/" RELEASE_VERSION |
| 116 | " (" MANIFEST_DATE " " MANIFEST_VERSION ")\r\n"); |
| 117 | if( g.fHttpTrace ){ |
| 118 | blob_appendf(pHdr, "Content-Type: application/x-fossil-debug\r\n"); |
| 119 | }else{ |
| 120 | blob_appendf(pHdr, "Content-Type: application/x-fossil\r\n"); |
| 121 | } |
| 122 |
+27
-19
| --- src/login.c | ||
| +++ src/login.c | ||
| @@ -352,42 +352,50 @@ | ||
| 352 | 352 | ** downstream problems here. We could alternately use "" here. |
| 353 | 353 | */ |
| 354 | 354 | ; |
| 355 | 355 | } |
| 356 | 356 | } |
| 357 | + | |
| 358 | +/* | |
| 359 | +** Return true if the prefix of zStr matches zPattern. Return false if | |
| 360 | +** they are different. | |
| 361 | +** | |
| 362 | +** A lowercase character in zPattern will match either upper or lower | |
| 363 | +** case in zStr. But an uppercase in zPattern will only match an | |
| 364 | +** uppercase in zStr. | |
| 365 | +*/ | |
| 366 | +static int prefix_match(const char *zPattern, const char *zStr){ | |
| 367 | + int i; | |
| 368 | + char c; | |
| 369 | + for(i=0; (c = zPattern[i])!=0; i++){ | |
| 370 | + if( zStr[i]!=c && fossil_tolower(zStr[i])!=c ) return 0; | |
| 371 | + } | |
| 372 | + return 1; | |
| 373 | +} | |
| 357 | 374 | |
| 358 | 375 | /* |
| 359 | 376 | ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent |
| 360 | 377 | ** is a manually operated browser or a bot. When in doubt, assume a bot. |
| 361 | 378 | ** Return true if we believe the agent is a real person. |
| 362 | 379 | */ |
| 363 | 380 | static int isHuman(const char *zAgent){ |
| 364 | 381 | int i; |
| 365 | - int seenCompatible = 0; | |
| 366 | 382 | if( zAgent==0 ) return 0; /* If not UserAgent, the probably a bot */ |
| 367 | 383 | for(i=0; zAgent[i]; i++){ |
| 368 | - char c = zAgent[i]; | |
| 369 | - if( c=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0; | |
| 370 | - if( c=='p' && memcmp(&zAgent[i],"pider",5)==0 ) return 0; /* "spider" */ | |
| 371 | - if( c=='r' && memcmp(&zAgent[i],"rawl",4)==0 ) return 0; /* "crawler" */ | |
| 372 | - /* Anything that puts a URL in the UserAgent string is probably a bot */ | |
| 373 | - if( c=='h' && memcmp(&zAgent[i],"http",4)==0 ) return 0; | |
| 374 | - if( c=='c' && seenCompatible==0 && memcmp(&zAgent[i],"compatible",11)==0 ){ | |
| 375 | - seenCompatible = i; | |
| 376 | - i+=10; | |
| 377 | - } | |
| 384 | + if( prefix_match("bot", zAgent+i) ) return 0; | |
| 385 | + if( prefix_match("spider", zAgent+i) ) return 0; | |
| 386 | + if( prefix_match("crawl", zAgent+i) ) return 0; | |
| 387 | + /* If a URI appears in the User-Agent, it is probably a bot */ | |
| 388 | + if( memcmp("http", zAgent+i,4)==0 ) return 0; | |
| 378 | 389 | } |
| 379 | 390 | if( memcmp(zAgent, "Mozilla/", 8)==0 ){ |
| 380 | 391 | if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */ |
| 381 | - if( seenCompatible | |
| 382 | - && memcmp(&zAgent[seenCompatible],"compatible;_MSIE_", 18)!=0 | |
| 383 | - ){ | |
| 384 | - /* If it claims to be Mozilla compatible and it isn't MSIE, then it | |
| 385 | - ** is probably a bot */ | |
| 386 | - return 0; | |
| 387 | - } | |
| 388 | - return 1; | |
| 392 | + if( strglob("*Firefox/[1-9]*", zAgent) ) return 1; | |
| 393 | + if( strglob("*Chrome/[1-9]*", zAgent) ) return 1; | |
| 394 | + if( strglob("*(compatible;?MSIE?[1-9]*", zAgent) ) return 1; | |
| 395 | + if( strglob("*AppleWebKit/[1-9]*(KHTML*", zAgent) ) return 1; | |
| 396 | + return 0; | |
| 389 | 397 | } |
| 390 | 398 | if( memcmp(zAgent, "Opera/", 6)==0 ) return 1; |
| 391 | 399 | if( memcmp(zAgent, "Safari/", 7)==0 ) return 1; |
| 392 | 400 | if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1; |
| 393 | 401 | return 0; |
| 394 | 402 |
| --- src/login.c | |
| +++ src/login.c | |
| @@ -352,42 +352,50 @@ | |
| 352 | ** downstream problems here. We could alternately use "" here. |
| 353 | */ |
| 354 | ; |
| 355 | } |
| 356 | } |
| 357 | |
| 358 | /* |
| 359 | ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent |
| 360 | ** is a manually operated browser or a bot. When in doubt, assume a bot. |
| 361 | ** Return true if we believe the agent is a real person. |
| 362 | */ |
| 363 | static int isHuman(const char *zAgent){ |
| 364 | int i; |
| 365 | int seenCompatible = 0; |
| 366 | if( zAgent==0 ) return 0; /* If not UserAgent, the probably a bot */ |
| 367 | for(i=0; zAgent[i]; i++){ |
| 368 | char c = zAgent[i]; |
| 369 | if( c=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0; |
| 370 | if( c=='p' && memcmp(&zAgent[i],"pider",5)==0 ) return 0; /* "spider" */ |
| 371 | if( c=='r' && memcmp(&zAgent[i],"rawl",4)==0 ) return 0; /* "crawler" */ |
| 372 | /* Anything that puts a URL in the UserAgent string is probably a bot */ |
| 373 | if( c=='h' && memcmp(&zAgent[i],"http",4)==0 ) return 0; |
| 374 | if( c=='c' && seenCompatible==0 && memcmp(&zAgent[i],"compatible",11)==0 ){ |
| 375 | seenCompatible = i; |
| 376 | i+=10; |
| 377 | } |
| 378 | } |
| 379 | if( memcmp(zAgent, "Mozilla/", 8)==0 ){ |
| 380 | if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */ |
| 381 | if( seenCompatible |
| 382 | && memcmp(&zAgent[seenCompatible],"compatible;_MSIE_", 18)!=0 |
| 383 | ){ |
| 384 | /* If it claims to be Mozilla compatible and it isn't MSIE, then it |
| 385 | ** is probably a bot */ |
| 386 | return 0; |
| 387 | } |
| 388 | return 1; |
| 389 | } |
| 390 | if( memcmp(zAgent, "Opera/", 6)==0 ) return 1; |
| 391 | if( memcmp(zAgent, "Safari/", 7)==0 ) return 1; |
| 392 | if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1; |
| 393 | return 0; |
| 394 |
| --- src/login.c | |
| +++ src/login.c | |
| @@ -352,42 +352,50 @@ | |
| 352 | ** downstream problems here. We could alternately use "" here. |
| 353 | */ |
| 354 | ; |
| 355 | } |
| 356 | } |
| 357 | |
| 358 | /* |
| 359 | ** Return true if the prefix of zStr matches zPattern. Return false if |
| 360 | ** they are different. |
| 361 | ** |
| 362 | ** A lowercase character in zPattern will match either upper or lower |
| 363 | ** case in zStr. But an uppercase in zPattern will only match an |
| 364 | ** uppercase in zStr. |
| 365 | */ |
| 366 | static int prefix_match(const char *zPattern, const char *zStr){ |
| 367 | int i; |
| 368 | char c; |
| 369 | for(i=0; (c = zPattern[i])!=0; i++){ |
| 370 | if( zStr[i]!=c && fossil_tolower(zStr[i])!=c ) return 0; |
| 371 | } |
| 372 | return 1; |
| 373 | } |
| 374 | |
| 375 | /* |
| 376 | ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent |
| 377 | ** is a manually operated browser or a bot. When in doubt, assume a bot. |
| 378 | ** Return true if we believe the agent is a real person. |
| 379 | */ |
| 380 | static int isHuman(const char *zAgent){ |
| 381 | int i; |
| 382 | if( zAgent==0 ) return 0; /* If not UserAgent, the probably a bot */ |
| 383 | for(i=0; zAgent[i]; i++){ |
| 384 | if( prefix_match("bot", zAgent+i) ) return 0; |
| 385 | if( prefix_match("spider", zAgent+i) ) return 0; |
| 386 | if( prefix_match("crawl", zAgent+i) ) return 0; |
| 387 | /* If a URI appears in the User-Agent, it is probably a bot */ |
| 388 | if( memcmp("http", zAgent+i,4)==0 ) return 0; |
| 389 | } |
| 390 | if( memcmp(zAgent, "Mozilla/", 8)==0 ){ |
| 391 | if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */ |
| 392 | if( strglob("*Firefox/[1-9]*", zAgent) ) return 1; |
| 393 | if( strglob("*Chrome/[1-9]*", zAgent) ) return 1; |
| 394 | if( strglob("*(compatible;?MSIE?[1-9]*", zAgent) ) return 1; |
| 395 | if( strglob("*AppleWebKit/[1-9]*(KHTML*", zAgent) ) return 1; |
| 396 | return 0; |
| 397 | } |
| 398 | if( memcmp(zAgent, "Opera/", 6)==0 ) return 1; |
| 399 | if( memcmp(zAgent, "Safari/", 7)==0 ) return 1; |
| 400 | if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1; |
| 401 | return 0; |
| 402 |