| | @@ -264,10 +264,28 @@ |
| 264 | 264 | ** particularly difficult to compute. In all other case, the tag should |
| 265 | 265 | ** exactly match the page name. |
| 266 | 266 | ** |
| 267 | 267 | ** Change this setting "off" to disable all robot restrictions. |
| 268 | 268 | */ |
| 269 | +/* |
| 270 | +** SETTING: robot-exception width=40 block-text |
| 271 | +** |
| 272 | +** The value of this setting should be a regular expression. |
| 273 | +** If the REQUEST_URI without the SCRIPT_NAME prefix |
| 274 | +** matches this regular expression, then the request is an exception |
| 275 | +** to anti-robot defenses and should be allowed through. For |
| 276 | +** example, to allow robots to download tarballs or ZIP archives |
| 277 | +** for named versions and releases, you could use an expression like |
| 278 | +** this: |
| 279 | +** |
| 280 | +** ^/(tarball|zip)\\b.*\\b(version-|release)\\b |
| 281 | +** |
| 282 | +** This setting can hold multiple regular expressions, one |
| 283 | +** regular expression per line. The input URL is exempted from |
| 284 | +** anti-robot defenses if any of the multiple regular expressions |
| 285 | +** matches. |
| 286 | +*/ |
| 269 | 287 | |
| 270 | 288 | /* |
| 271 | 289 | ** Return the default restriction GLOB |
| 272 | 290 | */ |
| 273 | 291 | const char *robot_restrict_default(void){ |
| | @@ -287,10 +305,81 @@ |
| 287 | 305 | if( zGlob[0]==0 || fossil_strcmp(zGlob, "off")==0 ){ |
| 288 | 306 | return 0; |
| 289 | 307 | } |
| 290 | 308 | return glob_multi_match(zGlob,zTag); |
| 291 | 309 | } |
| 310 | + |
| 311 | +/* |
| 312 | +** Check the request URI to see if it matches one of the URI |
| 313 | +** exceptions listed in the robot-exception setting. Return true |
| 314 | +** if it does. Return false if it does not. |
| 315 | +** |
| 316 | +** For the purposes of this routine, the "request URI" means |
| 317 | +** the REQUEST_URI value with the SCRIPT_NAME prefix removed and |
| 318 | +** with QUERY_STRING appended with a "?" separator if QUERY_STRING |
| 319 | +** is not empty. |
| 320 | +** |
| 321 | +** If the robot-exception setting does not exist or is an empty |
| 322 | +** string, then return false. |
| 323 | +*/ |
| 324 | +int robot_exception(void){ |
| 325 | +  const char *zRE = db_get("robot-exception",0); |
| 326 | +  const char *zQS; /* QUERY_STRING */ |
| 327 | +  const char *zURI; /* REQUEST_URI */ |
| 328 | +  const char *zSN; /* SCRIPT_NAME */ |
| 329 | +  const char *zNL; /* One past the last non-space char of current line */ |
| 330 | +  char *zRequest; /* REQUEST_URL w/o SCRIPT_NAME prefix + QUERY_STRING */ |
| 331 | +  int nRequest; /* Length of zRequest in bytes */ |
| 332 | +  size_t nURI, nSN; /* Length of zURI and zSN */ |
| 333 | +  int bMatch = 0; /* True if there is a match */ |
| 334 | + |
| 335 | +  if( zRE==0 ) return 0; |
| 336 | +  if( zRE[0]==0 ) return 0; |
| 337 | +  zURI = PD("REQUEST_URI",""); |
| 338 | +  nURI = strlen(zURI); |
| 339 | +  zSN = PD("SCRIPT_NAME",""); |
| 340 | +  nSN = strlen(zSN); |
| 341 | +  if( nSN<=nURI ) zURI += nSN; /* drop SCRIPT_NAME prefix (assumed to match) */ |
| 342 | +  zQS = P("QUERY_STRING"); |
| 343 | +  if( zQS && zQS[0] ){ |
| 344 | +    zRequest = mprintf("%s?%s", zURI, zQS); |
| 345 | +  }else{ |
| 346 | +    zRequest = fossil_strdup(zURI); |
| 347 | +  } |
| 348 | +  nRequest = (int)strlen(zRequest); |
| 349 | +  while( zRE[0] && bMatch==0 ){ |
| 350 | +    char *z; |
| 351 | +    const char *zErr; |
| 352 | +    size_t n; |
| 353 | +    ReCompiled *pRe; |
| 354 | +    zNL = strchr(zRE,'\n'); |
| 355 | +    if( zNL ){ |
| 356 | +      n = (size_t)(zNL - zRE)+1; /* consume this line and its newline */ |
| 357 | +    }else{ |
| 358 | +      n = strlen(zRE);  zNL = zRE + n; /* final line with no newline */ |
| 359 | +    } |
| 360 | +    while( zNL>zRE && fossil_isspace(zNL[-1]) ) zNL--; /* trim trailing space */ |
| 361 | +    if( zNL==zRE ){ |
| 362 | +      zRE += n; /* line is entirely whitespace - skip it */ |
| 363 | +      continue; |
| 364 | +    } |
| 365 | +    z = mprintf("%.*s", (int)(zNL - zRE), zRE); |
| 366 | +    zRE += n; |
| 367 | +    zErr = re_compile(&pRe, z, 0); |
| 368 | +    if( zErr ){ |
| 369 | +      fossil_warning("robot-exception error \"%s\" in expression \"%s\"\n", |
| 370 | +                     zErr, z); |
| 371 | +      fossil_free(z); |
| 372 | +      continue; |
| 373 | +    } |
| 374 | +    fossil_free(z); |
| 375 | +    bMatch = re_match(pRe, (const unsigned char*)zRequest, nRequest); |
| 376 | +    re_free(pRe); |
| 377 | +  } |
| 378 | +  fossil_free(zRequest); |
| 379 | +  return bMatch; |
| 380 | +} |
| 292 | 381 | |
| 293 | 382 | /* |
| 294 | 383 | ** Check to see if the page named in the argument is on the |
| 295 | 384 | ** robot-restrict list. If it is on the list and if the user |
| 296 | 385 | ** is "nobody" then bring up a captcha to test to make sure that |
| | @@ -302,10 +391,14 @@ |
| 302 | 391 | */ |
| 303 | 392 | int robot_restrict(const char *zTag){ |
| 304 | 393 | if( robot.resultCache==KNOWN_NOT_ROBOT ) return 0; |
| 305 | 394 | if( !robot_restrict_has_tag(zTag) ) return 0; |
| 306 | 395 | if( !client_might_be_a_robot() ) return 0; |
| 396 | + if( robot_exception() ){ /* URI is exempted by the robot-exception setting */ |
| 397 | + robot.resultCache = KNOWN_NOT_ROBOT; /* skip robot checks for rest of request */ |
| 398 | + return 0; |
| 399 | + } |
| 307 | 400 | |
| 308 | 401 | /* Generate the proof-of-work captcha */ |
| 309 | 402 | ask_for_proof_that_client_is_not_robot(); |
| 310 | 403 | return 1; |
| 311 | 404 | } |
| 312 | 405 | |