Fossil SCM

Enhancements to the UserAgent bot recognizer. We discovered earlier today, on the Fossil server itself, that it is very important not to misclassify bots as human: a spider that downloads every possible historical annotation, tarball, ZIP archive, and diff can really load up a server and soak up a lot of bandwidth.

drh 2011-11-25 16:11 trunk
Commit 83284480a39ba70863834c7bbf1407968ad2cf5e
2 files changed: src/login.c (+19 -7), src/style.c (+1)
+19 -7
--- src/login.c
+++ src/login.c
@@ -355,23 +355,35 @@
355355
}
356356
}
357357
358358
/*
359359
** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent
360
-** is a manually operated browser or a bot. When in doubt, assume a bot. Return
361
-** true if we believe the agent is a real person.
360
+** is a manually operated browser or a bot. When in doubt, assume a bot.
361
+** Return true if we believe the agent is a real person.
362362
*/
363363
static int isHuman(const char *zAgent){
364364
int i;
365
- if( zAgent==0 ) return 0;
365
+ int seenCompatible = 0;
366
+ int seenIE = 0;
367
+ if( zAgent==0 ) return 0; /* If not UserAgent, the probably a bot */
366368
for(i=0; zAgent[i]; i++){
367
- if( zAgent[i]=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0;
368
- if( zAgent[i]=='s' && memcmp(&zAgent[i],"spider",6)==0 ) return 0;
369
- if( zAgent[i]=='r' && memcmp(&zAgent[i],"rawl",4)==0 ) return 0;
369
+ char c = zAgent[i];
370
+ if( c=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0;
371
+ if( c=='s' && memcmp(&zAgent[i],"spider",6)==0 ) return 0;
372
+ if( c=='r' && memcmp(&zAgent[i],"rawl",4)==0 ) return 0; /* "crawler" */
373
+ /* Anything that puts a URL in the UserAgent string is probably a bot */
374
+ if( c=='h' && memcmp(&zAgent[i],"http",4)==0 ) return 0;
375
+ if( c=='c' && memcmp(&zAgent[i],"compatible",11)==0 ){
376
+ seenCompatible = 1;
377
+ i+=10;
378
+ }
379
+ if( c=='I' && zAgent[i+1]=='E' ) seenIE = 1;
370380
}
371381
if( memcmp(zAgent, "Mozilla/", 8)==0 ){
372
- return atoi(&zAgent[8])>=4;
382
+ if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */
383
+ if( seenCompatible && !seenIE ) return 0;
384
+ return 1;
373385
}
374386
if( memcmp(zAgent, "Opera/", 6)==0 ) return 1;
375387
if( memcmp(zAgent, "Safari/", 7)==0 ) return 1;
376388
if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1;
377389
return 0;
378390
--- src/login.c
+++ src/login.c
@@ -355,23 +355,35 @@
355 }
356 }
357
358 /*
359 ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent
360 ** is a manually operated browser or a bot. When in doubt, assume a bot. Return
361 ** true if we believe the agent is a real person.
362 */
363 static int isHuman(const char *zAgent){
364 int i;
365 if( zAgent==0 ) return 0;
 
 
366 for(i=0; zAgent[i]; i++){
367 if( zAgent[i]=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0;
368 if( zAgent[i]=='s' && memcmp(&zAgent[i],"spider",6)==0 ) return 0;
369 if( zAgent[i]=='r' && memcmp(&zAgent[i],"rawl",4)==0 ) return 0;
 
 
 
 
 
 
 
 
370 }
371 if( memcmp(zAgent, "Mozilla/", 8)==0 ){
372 return atoi(&zAgent[8])>=4;
 
 
373 }
374 if( memcmp(zAgent, "Opera/", 6)==0 ) return 1;
375 if( memcmp(zAgent, "Safari/", 7)==0 ) return 1;
376 if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1;
377 return 0;
378
--- src/login.c
+++ src/login.c
@@ -355,23 +355,35 @@
355 }
356 }
357
358 /*
359 ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent
360 ** is a manually operated browser or a bot. When in doubt, assume a bot.
361 ** Return true if we believe the agent is a real person.
362 */
363 static int isHuman(const char *zAgent){
364 int i;
365 int seenCompatible = 0;
366 int seenIE = 0;
367 if( zAgent==0 ) return 0; /* If not UserAgent, the probably a bot */
368 for(i=0; zAgent[i]; i++){
369 char c = zAgent[i];
370 if( c=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0;
371 if( c=='s' && memcmp(&zAgent[i],"spider",6)==0 ) return 0;
372 if( c=='r' && memcmp(&zAgent[i],"rawl",4)==0 ) return 0; /* "crawler" */
373 /* Anything that puts a URL in the UserAgent string is probably a bot */
374 if( c=='h' && memcmp(&zAgent[i],"http",4)==0 ) return 0;
375 if( c=='c' && memcmp(&zAgent[i],"compatible",11)==0 ){
376 seenCompatible = 1;
377 i+=10;
378 }
379 if( c=='I' && zAgent[i+1]=='E' ) seenIE = 1;
380 }
381 if( memcmp(zAgent, "Mozilla/", 8)==0 ){
382 if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */
383 if( seenCompatible && !seenIE ) return 0;
384 return 1;
385 }
386 if( memcmp(zAgent, "Opera/", 6)==0 ) return 1;
387 if( memcmp(zAgent, "Safari/", 7)==0 ) return 1;
388 if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1;
389 return 0;
390
--- src/style.c
+++ src/style.c
@@ -893,10 +893,11 @@
893893
zCap[i] = 0;
894894
@ g.userUid = %d(g.userUid)<br />
895895
@ g.zLogin = %h(g.zLogin)<br />
896896
@ capabilities = %s(zCap)<br />
897897
@ <hr>
898
+ P("HTTP_USER_AGENT");
898899
cgi_print_all(atoi(PD("showall","0")));
899900
if( g.perm.Setup ){
900901
const char *zRedir = P("redirect");
901902
if( zRedir ) cgi_redirect(zRedir);
902903
}
903904
--- src/style.c
+++ src/style.c
@@ -893,10 +893,11 @@
893 zCap[i] = 0;
894 @ g.userUid = %d(g.userUid)<br />
895 @ g.zLogin = %h(g.zLogin)<br />
896 @ capabilities = %s(zCap)<br />
897 @ <hr>
 
898 cgi_print_all(atoi(PD("showall","0")));
899 if( g.perm.Setup ){
900 const char *zRedir = P("redirect");
901 if( zRedir ) cgi_redirect(zRedir);
902 }
903
--- src/style.c
+++ src/style.c
@@ -893,10 +893,11 @@
893 zCap[i] = 0;
894 @ g.userUid = %d(g.userUid)<br />
895 @ g.zLogin = %h(g.zLogin)<br />
896 @ capabilities = %s(zCap)<br />
897 @ <hr>
898 P("HTTP_USER_AGENT");
899 cgi_print_all(atoi(PD("showall","0")));
900 if( g.perm.Setup ){
901 const char *zRedir = P("redirect");
902 if( zRedir ) cgi_redirect(zRedir);
903 }
904

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button