Fossil SCM

Further refinement of the robot/human discriminator.

drh 2011-11-26 15:09 trunk
Commit 4fdb63d690183255bc3101cbea413289dc8416a0
2 files changed +1 -1 +27 -19
+1 -1
--- src/http.c
+++ src/http.c
@@ -111,11 +111,11 @@
111111
fossil_free(zEncoded);
112112
fossil_free(zCredentials);
113113
}
114114
blob_appendf(pHdr, "Host: %s\r\n", g.urlHostname);
115115
blob_appendf(pHdr, "User-Agent: Fossil/" RELEASE_VERSION
116
- "-" MANIFEST_VERSION "\r\n");
116
+ " (" MANIFEST_DATE " " MANIFEST_VERSION ")\r\n");
117117
if( g.fHttpTrace ){
118118
blob_appendf(pHdr, "Content-Type: application/x-fossil-debug\r\n");
119119
}else{
120120
blob_appendf(pHdr, "Content-Type: application/x-fossil\r\n");
121121
}
122122
--- src/http.c
+++ src/http.c
@@ -111,11 +111,11 @@
111 fossil_free(zEncoded);
112 fossil_free(zCredentials);
113 }
114 blob_appendf(pHdr, "Host: %s\r\n", g.urlHostname);
115 blob_appendf(pHdr, "User-Agent: Fossil/" RELEASE_VERSION
116 "-" MANIFEST_VERSION "\r\n");
117 if( g.fHttpTrace ){
118 blob_appendf(pHdr, "Content-Type: application/x-fossil-debug\r\n");
119 }else{
120 blob_appendf(pHdr, "Content-Type: application/x-fossil\r\n");
121 }
122
--- src/http.c
+++ src/http.c
@@ -111,11 +111,11 @@
111 fossil_free(zEncoded);
112 fossil_free(zCredentials);
113 }
114 blob_appendf(pHdr, "Host: %s\r\n", g.urlHostname);
115 blob_appendf(pHdr, "User-Agent: Fossil/" RELEASE_VERSION
116 " (" MANIFEST_DATE " " MANIFEST_VERSION ")\r\n");
117 if( g.fHttpTrace ){
118 blob_appendf(pHdr, "Content-Type: application/x-fossil-debug\r\n");
119 }else{
120 blob_appendf(pHdr, "Content-Type: application/x-fossil\r\n");
121 }
122
+27 -19
--- src/login.c
+++ src/login.c
@@ -352,42 +352,50 @@
352352
** downstream problems here. We could alternately use "" here.
353353
*/
354354
;
355355
}
356356
}
357
+
358
+/*
359
+** Return true if the prefix of zStr matches zPattern. Return false if
360
+** they are different.
361
+**
362
+** A lowercase character in zPattern will match either upper or lower
363
+** case in zStr. But an uppercase in zPattern will only match an
364
+** uppercase in zStr.
365
+*/
366
+static int prefix_match(const char *zPattern, const char *zStr){
367
+ int i;
368
+ char c;
369
+ for(i=0; (c = zPattern[i])!=0; i++){
370
+ if( zStr[i]!=c && fossil_tolower(zStr[i])!=c ) return 0;
371
+ }
372
+ return 1;
373
+}
357374
358375
/*
359376
** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent
360377
** is a manually operated browser or a bot. When in doubt, assume a bot.
361378
** Return true if we believe the agent is a real person.
362379
*/
363380
static int isHuman(const char *zAgent){
364381
int i;
365
- int seenCompatible = 0;
366382
if( zAgent==0 ) return 0; /* If no User-Agent, then probably a bot */
367383
for(i=0; zAgent[i]; i++){
368
- char c = zAgent[i];
369
- if( c=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0;
370
- if( c=='p' && memcmp(&zAgent[i],"pider",5)==0 ) return 0; /* "spider" */
371
- if( c=='r' && memcmp(&zAgent[i],"rawl",4)==0 ) return 0; /* "crawler" */
372
- /* Anything that puts a URL in the UserAgent string is probably a bot */
373
- if( c=='h' && memcmp(&zAgent[i],"http",4)==0 ) return 0;
374
- if( c=='c' && seenCompatible==0 && memcmp(&zAgent[i],"compatible",11)==0 ){
375
- seenCompatible = i;
376
- i+=10;
377
- }
384
+ if( prefix_match("bot", zAgent+i) ) return 0;
385
+ if( prefix_match("spider", zAgent+i) ) return 0;
386
+ if( prefix_match("crawl", zAgent+i) ) return 0;
387
+ /* If a URI appears in the User-Agent, it is probably a bot */
388
+ if( memcmp("http", zAgent+i,4)==0 ) return 0;
378389
}
379390
if( memcmp(zAgent, "Mozilla/", 8)==0 ){
380391
if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */
381
- if( seenCompatible
382
- && memcmp(&zAgent[seenCompatible],"compatible;_MSIE_", 18)!=0
383
- ){
384
- /* If it claims to be Mozilla compatible and it isn't MSIE, then it
385
- ** is probably a bot */
386
- return 0;
387
- }
388
- return 1;
392
+ if( strglob("*Firefox/[1-9]*", zAgent) ) return 1;
393
+ if( strglob("*Chrome/[1-9]*", zAgent) ) return 1;
394
+ if( strglob("*(compatible;?MSIE?[1-9]*", zAgent) ) return 1;
395
+ if( strglob("*AppleWebKit/[1-9]*(KHTML*", zAgent) ) return 1;
396
+ return 0;
389397
}
390398
if( memcmp(zAgent, "Opera/", 6)==0 ) return 1;
391399
if( memcmp(zAgent, "Safari/", 7)==0 ) return 1;
392400
if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1;
393401
return 0;
394402
--- src/login.c
+++ src/login.c
@@ -352,42 +352,50 @@
352 ** downstream problems here. We could alternately use "" here.
353 */
354 ;
355 }
356 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
358 /*
359 ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent
360 ** is a manually operated browser or a bot. When in doubt, assume a bot.
361 ** Return true if we believe the agent is a real person.
362 */
363 static int isHuman(const char *zAgent){
364 int i;
365 int seenCompatible = 0;
366 if( zAgent==0 ) return 0; /* If no User-Agent, then probably a bot */
367 for(i=0; zAgent[i]; i++){
368 char c = zAgent[i];
369 if( c=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0;
370 if( c=='p' && memcmp(&zAgent[i],"pider",5)==0 ) return 0; /* "spider" */
371 if( c=='r' && memcmp(&zAgent[i],"rawl",4)==0 ) return 0; /* "crawler" */
372 /* Anything that puts a URL in the UserAgent string is probably a bot */
373 if( c=='h' && memcmp(&zAgent[i],"http",4)==0 ) return 0;
374 if( c=='c' && seenCompatible==0 && memcmp(&zAgent[i],"compatible",11)==0 ){
375 seenCompatible = i;
376 i+=10;
377 }
378 }
379 if( memcmp(zAgent, "Mozilla/", 8)==0 ){
380 if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */
381 if( seenCompatible
382 && memcmp(&zAgent[seenCompatible],"compatible;_MSIE_", 18)!=0
383 ){
384 /* If it claims to be Mozilla compatible and it isn't MSIE, then it
385 ** is probably a bot */
386 return 0;
387 }
388 return 1;
389 }
390 if( memcmp(zAgent, "Opera/", 6)==0 ) return 1;
391 if( memcmp(zAgent, "Safari/", 7)==0 ) return 1;
392 if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1;
393 return 0;
394
--- src/login.c
+++ src/login.c
@@ -352,42 +352,50 @@
352 ** downstream problems here. We could alternately use "" here.
353 */
354 ;
355 }
356 }
357
358 /*
359 ** Return true if the prefix of zStr matches zPattern. Return false if
360 ** they are different.
361 **
362 ** A lowercase character in zPattern will match either upper or lower
363 ** case in zStr. But an uppercase in zPattern will only match an
364 ** uppercase in zStr.
365 */
366 static int prefix_match(const char *zPattern, const char *zStr){
367 int i;
368 char c;
369 for(i=0; (c = zPattern[i])!=0; i++){
370 if( zStr[i]!=c && fossil_tolower(zStr[i])!=c ) return 0;
371 }
372 return 1;
373 }
374
375 /*
376 ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent
377 ** is a manually operated browser or a bot. When in doubt, assume a bot.
378 ** Return true if we believe the agent is a real person.
379 */
380 static int isHuman(const char *zAgent){
381 int i;
 
382 if( zAgent==0 ) return 0; /* If no User-Agent, then probably a bot */
383 for(i=0; zAgent[i]; i++){
384 if( prefix_match("bot", zAgent+i) ) return 0;
385 if( prefix_match("spider", zAgent+i) ) return 0;
386 if( prefix_match("crawl", zAgent+i) ) return 0;
387 /* If a URI appears in the User-Agent, it is probably a bot */
388 if( memcmp("http", zAgent+i,4)==0 ) return 0;
 
 
 
 
 
389 }
390 if( memcmp(zAgent, "Mozilla/", 8)==0 ){
391 if( atoi(&zAgent[8])<4 ) return 0; /* Many bots advertise as Mozilla/3 */
392 if( strglob("*Firefox/[1-9]*", zAgent) ) return 1;
393 if( strglob("*Chrome/[1-9]*", zAgent) ) return 1;
394 if( strglob("*(compatible;?MSIE?[1-9]*", zAgent) ) return 1;
395 if( strglob("*AppleWebKit/[1-9]*(KHTML*", zAgent) ) return 1;
396 return 0;
 
 
 
397 }
398 if( memcmp(zAgent, "Opera/", 6)==0 ) return 1;
399 if( memcmp(zAgent, "Safari/", 7)==0 ) return 1;
400 if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1;
401 return 0;
402

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button