Fossil SCM
Improvements to the User-Agent bot recognizer. Add the test-ishuman command for testing the bot recognizer.
Commit
06e0cb70054d3c3e303a808563ca6c1ab2b86c2b
Parent
98cc52065294a49…
1 file changed, 20 insertions(+), 4 deletions(-)
src/login.c: +20 -4
| --- src/login.c | ||
| +++ src/login.c | ||
| @@ -203,23 +203,39 @@ | ||
| 203 | 203 | /* |
| 204 | 204 | ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent |
| 205 | 205 | ** is a manually operated browser or a bot. When in doubt, assume a bot. Return |
| 206 | 206 | ** true if we believe the agent is a real person. |
| 207 | 207 | */ |
| 208 | -static int isHuman(void){ | |
| 209 | - const char *zAgent = P("HTTP_USER_AGENT"); | |
| 208 | +static int isHuman(const char *zAgent){ | |
| 210 | 209 | int i; |
| 211 | 210 | if( zAgent==0 ) return 0; |
| 212 | 211 | for(i=0; zAgent[i]; i++){ |
| 213 | 212 | if( zAgent[i]=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0; |
| 214 | 213 | if( zAgent[i]=='s' && memcmp(&zAgent[i],"spider",6)==0 ) return 0; |
| 215 | 214 | } |
| 216 | - if( memcmp(zAgent, "Mozilla/", 8)==0 ) return 1; | |
| 215 | + if( memcmp(zAgent, "Mozilla/", 8)==0 ){ | |
| 216 | + return atoi(&zAgent[8])>=4; | |
| 217 | + } | |
| 217 | 218 | if( memcmp(zAgent, "Opera/", 6)==0 ) return 1; |
| 218 | 219 | if( memcmp(zAgent, "Safari/", 7)==0 ) return 1; |
| 220 | + if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1; | |
| 219 | 221 | return 0; |
| 220 | 222 | } |
| 223 | + | |
| 224 | +/* | |
| 225 | +** COMMAND: test-ishuman | |
| 226 | +** | |
| 227 | +** Read lines of text from standard input. Interpret each line of text | |
| 228 | +** as a User-Agent string from an HTTP header. Label each line as HUMAN | |
| 229 | +** or ROBOT. | |
| 230 | +*/ | |
| 231 | +void test_ishuman(void){ | |
| 232 | + char zLine[3000]; | |
| 233 | + while( fgets(zLine, sizeof(zLine), stdin) ){ | |
| 234 | + fossil_print("%s %s", isHuman(zLine) ? "HUMAN" : "ROBOT", zLine); | |
| 235 | + } | |
| 236 | +} | |
| 221 | 237 | |
| 222 | 238 | /* |
| 223 | 239 | ** SQL function for constant time comparison of two values. |
| 224 | 240 | ** Sets result to 0 if two values are equal. |
| 225 | 241 | */ |
| @@ -747,11 +763,11 @@ | ||
| 747 | 763 | |
| 748 | 764 | /* Set the capabilities */ |
| 749 | 765 | login_set_capabilities(zCap, 0); |
| 750 | 766 | login_set_anon_nobody_capabilities(); |
| 751 | 767 | if( zCap[0] && !g.perm.History && db_get_boolean("auto-enable-hyperlinks",1) |
| 752 | - && isHuman() ){ | |
| 768 | + && isHuman(P("HTTP_USER_AGENT")) ){ | |
| 753 | 769 | g.perm.History = 1; |
| 754 | 770 | } |
| 755 | 771 | } |
| 756 | 772 | |
| 757 | 773 | /* |
| 758 | 774 |
| --- src/login.c | |
| +++ src/login.c | |
| @@ -203,23 +203,39 @@ | |
| 203 | /* |
| 204 | ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent |
| 205 | ** is a manually operated browser or a bot. When in doubt, assume a bot. Return |
| 206 | ** true if we believe the agent is a real person. |
| 207 | */ |
| 208 | static int isHuman(void){ |
| 209 | const char *zAgent = P("HTTP_USER_AGENT"); |
| 210 | int i; |
| 211 | if( zAgent==0 ) return 0; |
| 212 | for(i=0; zAgent[i]; i++){ |
| 213 | if( zAgent[i]=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0; |
| 214 | if( zAgent[i]=='s' && memcmp(&zAgent[i],"spider",6)==0 ) return 0; |
| 215 | } |
| 216 | if( memcmp(zAgent, "Mozilla/", 8)==0 ) return 1; |
| 217 | if( memcmp(zAgent, "Opera/", 6)==0 ) return 1; |
| 218 | if( memcmp(zAgent, "Safari/", 7)==0 ) return 1; |
| 219 | return 0; |
| 220 | } |
| 221 | |
| 222 | /* |
| 223 | ** SQL function for constant time comparison of two values. |
| 224 | ** Sets result to 0 if two values are equal. |
| 225 | */ |
| @@ -747,11 +763,11 @@ | |
| 747 | |
| 748 | /* Set the capabilities */ |
| 749 | login_set_capabilities(zCap, 0); |
| 750 | login_set_anon_nobody_capabilities(); |
| 751 | if( zCap[0] && !g.perm.History && db_get_boolean("auto-enable-hyperlinks",1) |
| 752 | && isHuman() ){ |
| 753 | g.perm.History = 1; |
| 754 | } |
| 755 | } |
| 756 | |
| 757 | /* |
| 758 |
| --- src/login.c | |
| +++ src/login.c | |
| @@ -203,23 +203,39 @@ | |
| 203 | /* |
| 204 | ** Look at the HTTP_USER_AGENT parameter and try to determine if the user agent |
| 205 | ** is a manually operated browser or a bot. When in doubt, assume a bot. Return |
| 206 | ** true if we believe the agent is a real person. |
| 207 | */ |
| 208 | static int isHuman(const char *zAgent){ |
| 209 | int i; |
| 210 | if( zAgent==0 ) return 0; |
| 211 | for(i=0; zAgent[i]; i++){ |
| 212 | if( zAgent[i]=='b' && memcmp(&zAgent[i],"bot",3)==0 ) return 0; |
| 213 | if( zAgent[i]=='s' && memcmp(&zAgent[i],"spider",6)==0 ) return 0; |
| 214 | } |
| 215 | if( memcmp(zAgent, "Mozilla/", 8)==0 ){ |
| 216 | return atoi(&zAgent[8])>=4; |
| 217 | } |
| 218 | if( memcmp(zAgent, "Opera/", 6)==0 ) return 1; |
| 219 | if( memcmp(zAgent, "Safari/", 7)==0 ) return 1; |
| 220 | if( memcmp(zAgent, "Lynx/", 5)==0 ) return 1; |
| 221 | return 0; |
| 222 | } |
| 223 | |
| 224 | /* |
| 225 | ** COMMAND: test-ishuman |
| 226 | ** |
| 227 | ** Read lines of text from standard input. Interpret each line of text |
| 228 | ** as a User-Agent string from an HTTP header. Label each line as HUMAN |
| 229 | ** or ROBOT. |
| 230 | */ |
| 231 | void test_ishuman(void){ |
| 232 | char zLine[3000]; |
| 233 | while( fgets(zLine, sizeof(zLine), stdin) ){ |
| 234 | fossil_print("%s %s", isHuman(zLine) ? "HUMAN" : "ROBOT", zLine); |
| 235 | } |
| 236 | } |
| 237 | |
| 238 | /* |
| 239 | ** SQL function for constant time comparison of two values. |
| 240 | ** Sets result to 0 if two values are equal. |
| 241 | */ |
| @@ -747,11 +763,11 @@ | |
| 763 | |
| 764 | /* Set the capabilities */ |
| 765 | login_set_capabilities(zCap, 0); |
| 766 | login_set_anon_nobody_capabilities(); |
| 767 | if( zCap[0] && !g.perm.History && db_get_boolean("auto-enable-hyperlinks",1) |
| 768 | && isHuman(P("HTTP_USER_AGENT")) ){ |
| 769 | g.perm.History = 1; |
| 770 | } |
| 771 | } |
| 772 | |
| 773 | /* |
| 774 |