Fossil SCM
Enhancements to the UserAgent bot recognizer. We discovered earlier today on the Fossil server itself that it is very important not to misclassify bots as human since a spider that downloads every possible historical annotation and tarball and zip archive and diff can really load up a server and soak up a lot of bandwidth.
Commit
83284480a39ba70863834c7bbf1407968ad2cf5e
Parent
fe075f5d89f611b…
2 files changed
+19
-7
+1
+19
-7
| --- src/login.c | ||
| +++ src/login.c | ||
| @@ -355,23 +355,35 @@ | ||
| 355 | 355 | } |
| 356 | 356 | } |
| 357 | 357 | |
| 358 | 358 | /* |
| 359 | 359 | ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent |
| 360 | -** is a manually operated browser or a bot. When in doubt, assume a bot. Return | |
| 361 | -** true if we believe the agent is a real person. | |
| 360 | +** is a manually operated browser or a bot. When in doubt, assume a bot. | |
| 361 | +** Return true if we believe the agent is a real person. | |
| 362 | 362 | */ |
| 363 | 363 | static int isHuman(const char *zAgent){ |
| 364 | 364 | int i; |
| 365 | - if( zAgent==0 ) return 0; | |
| 365 | + int seenCompatible = 0; | |
| 366 | + int seenIE = 0; | |
| 367 | + if( zAgent==0 ) return 0; /* If no UserAgent, then probably a bot */ | |
| 366 | 368 | for(i=0; zAgent[i]; i++){ |
| 367 | - if( zAgent[i]=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0; | |
| 368 | - if( zAgent[i]=='s' && memcmp(&zAgent[i],"spider",6)==0 ) return 0; | |
| 369 | - if( zAgent[i]=='r' && memcmp(&zAgent[i],"rawl",4)==0 ) return 0; | |
| 369 | + char c = zAgent[i]; | |
| 370 | + if( c=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0; | |
| 371 | + if( c=='s' && memcmp(&zAgent[i],"spider",6)==0 ) return 0; | |
| 372 | + if( c=='r' && memcmp(&zAgent[i],"rawl",4)==0 ) return 0; /* "crawler" */ | |
| 373 | + /* Anything that puts a URL in the UserAgent string is probably a bot */ | |
| 374 | + if( c=='h' && memcmp(&zAgent[i],"http",4)==0 ) return 0; | |
| 375 | + if( c=='c' && memcmp(&zAgent[i],"compatible",11)==0 ){ | |
| 376 | + seenCompatible = 1; | |
| 377 | + i+=10; | |
| 378 | + } | |
| 379 | + if( c=='I' && zAgent[i+1]=='E' ) seenIE = 1; | |
| 370 | 380 | } |
| 371 | 381 | if( memcmp(zAgent, "Mozilla/", 8)==0 ){ |
| 372 | - return atoi(&zAgent[8])>=4; | |
| 382 | + if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */ | |
| 383 | + if( seenCompatible && !seenIE ) return 0; | |
| 384 | + return 1; | |
| 373 | 385 | } |
| 374 | 386 | if( memcmp(zAgent, "Opera/", 6)==0 ) return 1; |
| 375 | 387 | if( memcmp(zAgent, "Safari/", 7)==0 ) return 1; |
| 376 | 388 | if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1; |
| 377 | 389 | return 0; |
| 378 | 390 |
| --- src/login.c | |
| +++ src/login.c | |
| @@ -355,23 +355,35 @@ | |
| 355 | } |
| 356 | } |
| 357 | |
| 358 | /* |
| 359 | ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent |
| 360 | ** is a manually operated browser or a bot. When in doubt, assume a bot. Return |
| 361 | ** true if we believe the agent is a real person. |
| 362 | */ |
| 363 | static int isHuman(const char *zAgent){ |
| 364 | int i; |
| 365 | if( zAgent==0 ) return 0; |
| 366 | for(i=0; zAgent[i]; i++){ |
| 367 | if( zAgent[i]=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0; |
| 368 | if( zAgent[i]=='s' && memcmp(&zAgent[i],"spider",6)==0 ) return 0; |
| 369 | if( zAgent[i]=='r' && memcmp(&zAgent[i],"rawl",4)==0 ) return 0; |
| 370 | } |
| 371 | if( memcmp(zAgent, "Mozilla/", 8)==0 ){ |
| 372 | return atoi(&zAgent[8])>=4; |
| 373 | } |
| 374 | if( memcmp(zAgent, "Opera/", 6)==0 ) return 1; |
| 375 | if( memcmp(zAgent, "Safari/", 7)==0 ) return 1; |
| 376 | if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1; |
| 377 | return 0; |
| 378 |
| --- src/login.c | |
| +++ src/login.c | |
| @@ -355,23 +355,35 @@ | |
| 355 | } |
| 356 | } |
| 357 | |
| 358 | /* |
| 359 | ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent |
| 360 | ** is a manually operated browser or a bot. When in doubt, assume a bot. |
| 361 | ** Return true if we believe the agent is a real person. |
| 362 | */ |
| 363 | static int isHuman(const char *zAgent){ |
| 364 | int i; |
| 365 | int seenCompatible = 0; |
| 366 | int seenIE = 0; |
| 367 | if( zAgent==0 ) return 0; /* If no UserAgent, then probably a bot */ |
| 368 | for(i=0; zAgent[i]; i++){ |
| 369 | char c = zAgent[i]; |
| 370 | if( c=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0; |
| 371 | if( c=='s' && memcmp(&zAgent[i],"spider",6)==0 ) return 0; |
| 372 | if( c=='r' && memcmp(&zAgent[i],"rawl",4)==0 ) return 0; /* "crawler" */ |
| 373 | /* Anything that puts a URL in the UserAgent string is probably a bot */ |
| 374 | if( c=='h' && memcmp(&zAgent[i],"http",4)==0 ) return 0; |
| 375 | if( c=='c' && memcmp(&zAgent[i],"compatible",11)==0 ){ |
| 376 | seenCompatible = 1; |
| 377 | i+=10; |
| 378 | } |
| 379 | if( c=='I' && zAgent[i+1]=='E' ) seenIE = 1; |
| 380 | } |
| 381 | if( memcmp(zAgent, "Mozilla/", 8)==0 ){ |
| 382 | if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */ |
| 383 | if( seenCompatible && !seenIE ) return 0; |
| 384 | return 1; |
| 385 | } |
| 386 | if( memcmp(zAgent, "Opera/", 6)==0 ) return 1; |
| 387 | if( memcmp(zAgent, "Safari/", 7)==0 ) return 1; |
| 388 | if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1; |
| 389 | return 0; |
| 390 |
+1
| --- src/style.c | ||
| +++ src/style.c | ||
| @@ -893,10 +893,11 @@ | ||
| 893 | 893 | zCap[i] = 0; |
| 894 | 894 | @ g.userUid = %d(g.userUid)<br /> |
| 895 | 895 | @ g.zLogin = %h(g.zLogin)<br /> |
| 896 | 896 | @ capabilities = %s(zCap)<br /> |
| 897 | 897 | @ <hr> |
| 898 | + P("HTTP_USER_AGENT"); | |
| 898 | 899 | cgi_print_all(atoi(PD("showall","0"))); |
| 899 | 900 | if( g.perm.Setup ){ |
| 900 | 901 | const char *zRedir = P("redirect"); |
| 901 | 902 | if( zRedir ) cgi_redirect(zRedir); |
| 902 | 903 | } |
| 903 | 904 |
| --- src/style.c | |
| +++ src/style.c | |
| @@ -893,10 +893,11 @@ | |
| 893 | zCap[i] = 0; |
| 894 | @ g.userUid = %d(g.userUid)<br /> |
| 895 | @ g.zLogin = %h(g.zLogin)<br /> |
| 896 | @ capabilities = %s(zCap)<br /> |
| 897 | @ <hr> |
| 898 | cgi_print_all(atoi(PD("showall","0"))); |
| 899 | if( g.perm.Setup ){ |
| 900 | const char *zRedir = P("redirect"); |
| 901 | if( zRedir ) cgi_redirect(zRedir); |
| 902 | } |
| 903 |
| --- src/style.c | |
| +++ src/style.c | |
| @@ -893,10 +893,11 @@ | |
| 893 | zCap[i] = 0; |
| 894 | @ g.userUid = %d(g.userUid)<br /> |
| 895 | @ g.zLogin = %h(g.zLogin)<br /> |
| 896 | @ capabilities = %s(zCap)<br /> |
| 897 | @ <hr> |
| 898 | P("HTTP_USER_AGENT"); |
| 899 | cgi_print_all(atoi(PD("showall","0"))); |
| 900 | if( g.perm.Setup ){ |
| 901 | const char *zRedir = P("redirect"); |
| 902 | if( zRedir ) cgi_redirect(zRedir); |
| 903 | } |
| 904 |