Fossil SCM

Add the robot-exception setting.

drh 2025-08-21 14:08 trunk
Commit 86b6ef7fe3424284a38f9706482c58b4e65bd39d2f57ee0bdeb52065440e37a6
3 files changed +18 -5 +93 +17 -2
+18 -5
--- src/regexp.c
+++ src/regexp.c
@@ -850,25 +850,38 @@
850850
** Run a regular expression match over the named disk files, or against
851851
** standard input if no disk files are named on the command-line.
852852
**
853853
** Options:
854854
** -i|--ignore-case Ignore case
855
+** --robot-exception Use the robot-exception setting as the REGEXP
855856
*/
856857
void re_test_grep(void){
857858
ReCompiled *pRe;
858859
const char *zErr;
860
+ int iFileList = 3;
859861
int ignoreCase = find_option("ignore-case","i",0)!=0;
860
- if( g.argc<3 ){
861
- usage("REGEXP [FILE...]");
862
+ int bRobot = find_option("robot-exception",0,0)!=0;
863
+ if( bRobot ){
864
+ const char *zRe;
865
+ db_find_and_open_repository(0,0);
866
+ verify_all_options();
867
+ zRe = db_get("robot-exception","^$");
868
+ zErr = re_compile(&pRe, zRe, ignoreCase);
869
+ iFileList = 2;
870
+ }else{
871
+ verify_all_options();
872
+ if( g.argc<3 ){
873
+ usage("REGEXP [FILE...]");
874
+ }
875
+ zErr = re_compile(&pRe, g.argv[2], ignoreCase);
862876
}
863
- zErr = re_compile(&pRe, g.argv[2], ignoreCase);
864877
if( zErr ) fossil_fatal("%s", zErr);
865
- if( g.argc==3 ){
878
+ if( g.argc==iFileList ){
866879
grep_file(pRe, "-", stdin);
867880
}else{
868881
int i;
869
- for(i=3; i<g.argc; i++){
882
+ for(i=iFileList; i<g.argc; i++){
870883
FILE *in = fossil_fopen(g.argv[i], "rb");
871884
if( in==0 ){
872885
fossil_warning("cannot open \"%s\"", g.argv[i]);
873886
}else{
874887
grep_file(pRe, g.argv[i], in);
875888
--- src/regexp.c
+++ src/regexp.c
@@ -850,25 +850,38 @@
850 ** Run a regular expression match over the named disk files, or against
851 ** standard input if no disk files are named on the command-line.
852 **
853 ** Options:
854 ** -i|--ignore-case Ignore case
 
855 */
856 void re_test_grep(void){
857 ReCompiled *pRe;
858 const char *zErr;
 
859 int ignoreCase = find_option("ignore-case","i",0)!=0;
860 if( g.argc<3 ){
861 usage("REGEXP [FILE...]");
 
 
 
 
 
 
 
 
 
 
 
 
862 }
863 zErr = re_compile(&pRe, g.argv[2], ignoreCase);
864 if( zErr ) fossil_fatal("%s", zErr);
865 if( g.argc==3 ){
866 grep_file(pRe, "-", stdin);
867 }else{
868 int i;
869 for(i=3; i<g.argc; i++){
870 FILE *in = fossil_fopen(g.argv[i], "rb");
871 if( in==0 ){
872 fossil_warning("cannot open \"%s\"", g.argv[i]);
873 }else{
874 grep_file(pRe, g.argv[i], in);
875
--- src/regexp.c
+++ src/regexp.c
@@ -850,25 +850,38 @@
850 ** Run a regular expression match over the named disk files, or against
851 ** standard input if no disk files are named on the command-line.
852 **
853 ** Options:
854 ** -i|--ignore-case Ignore case
855 ** --robot-exception Use the robot-exception setting as the REGEXP
856 */
857 void re_test_grep(void){
858 ReCompiled *pRe;
859 const char *zErr;
860 int iFileList = 3;
861 int ignoreCase = find_option("ignore-case","i",0)!=0;
862 int bRobot = find_option("robot-exception",0,0)!=0;
863 if( bRobot ){
864 const char *zRe;
865 db_find_and_open_repository(0,0);
866 verify_all_options();
867 zRe = db_get("robot-exception","^$");
868 zErr = re_compile(&pRe, zRe, ignoreCase);
869 iFileList = 2;
870 }else{
871 verify_all_options();
872 if( g.argc<3 ){
873 usage("REGEXP [FILE...]");
874 }
875 zErr = re_compile(&pRe, g.argv[2], ignoreCase);
876 }
 
877 if( zErr ) fossil_fatal("%s", zErr);
878 if( g.argc==iFileList ){
879 grep_file(pRe, "-", stdin);
880 }else{
881 int i;
882 for(i=iFileList; i<g.argc; i++){
883 FILE *in = fossil_fopen(g.argv[i], "rb");
884 if( in==0 ){
885 fossil_warning("cannot open \"%s\"", g.argv[i]);
886 }else{
887 grep_file(pRe, g.argv[i], in);
888
+93
--- src/robot.c
+++ src/robot.c
@@ -264,10 +264,28 @@
264264
** particularly difficult to compute. In all other cases, the tag should
265265
** exactly match the page name.
266266
**
267267
** Change this setting to "off" to disable all robot restrictions.
268268
*/
269
+/*
270
+** SETTING: robot-exception width=40 block-text
271
+**
272
+** The value of this setting should be a regular expression.
273
+** If the REQUEST_URI without the SCRIPT_NAME prefix
274
+** matches this regular expression, then the request is an exception
275
+** to anti-robot defenses and should be allowed through. For
276
+** example, to allow robots to download tarballs or ZIP archives
277
+** for named versions and releases, you could use an expression like
278
+** this:
279
+**
280
+** ^/(tarball|zip)\\b*\\b(version-|release)\\b
281
+**
282
+** This setting can hold multiple regular expressions, one
283
+** regular expression per line. The input URL is exempted from
284
+** anti-robot defenses if any of the multiple regular expressions
285
+** matches.
286
+*/
269287
270288
/*
271289
** Return the default restriction GLOB
272290
*/
273291
const char *robot_restrict_default(void){
@@ -287,10 +305,81 @@
287305
if( zGlob[0]==0 || fossil_strcmp(zGlob, "off")==0 ){
288306
return 0;
289307
}
290308
return glob_multi_match(zGlob,zTag);
291309
}
310
+
311
+/*
312
+** Check the request URI to see if it matches one of the URI
313
+** exceptions listed in the robot-exception setting. Return true
314
+** if it does. Return false if it does not.
315
+**
316
+** For the purposes of this routine, the "request URI" means
317
+** the REQUEST_URI value with the SCRIPT_NAME prefix removed and
318
+** with QUERY_STRING appended with a "?" separator if QUERY_STRING
319
+** is not empty.
320
+**
321
+** If the robot-exception setting does not exist or is an empty
322
+** string, then return false.
323
+*/
324
+int robot_exception(void){
325
+ const char *zRE = db_get("robot-exception",0);
326
+ const char *zQS; /* QUERY_STRING */
327
+ const char *zURI; /* REQUEST_URI */
328
+ const char *zSN; /* SCRIPT_NAME */
329
+ const char *zNL; /* Next newline character */
330
+ char *zRequest; /* REQUEST_URL w/o SCRIPT_NAME prefix + QUERY_STRING */
331
+ int nRequest; /* Length of zRequest in bytes */
332
+ size_t nURI, nSN; /* Length of zURI and zSN */
333
+ int bMatch = 0; /* True if there is a match */
334
+
335
+ if( zRE==0 ) return 0;
336
+ if( zRE[0]==0 ) return 0;
337
+ zURI = PD("REQUEST_URI","");
338
+ nURI = strlen(zURI);
339
+ zSN = PD("SCRIPT_NAME","");
340
+ nSN = strlen(zSN);
341
+ if( nSN<=nURI ) zURI += nSN;
342
+ zQS = P("QUERY_STRING");
343
+ if( zQS && zQS[0] ){
344
+ zRequest = mprintf("%s?%s", zURI, zQS);
345
+ }else{
346
+ zRequest = fossil_strdup(zURI);
347
+ }
348
+ nRequest = (int)strlen(zRequest);
349
+ while( zRE[0] && bMatch==0 ){
350
+ char *z;
351
+ const char *zErr;
352
+ size_t n;
353
+ ReCompiled *pRe;
354
+ zNL = strchr(zRE,'\n');
355
+ if( zNL ){
356
+ n = (size_t)(zNL - zRE)+1;
357
+ while( zNL>zRE && fossil_isspace(zNL[0]) ) zNL--;
358
+ if( zNL==zRE ){
359
+ zRE += n;
360
+ continue;
361
+ }
362
+ }else{
363
+ n = strlen(zRE);
364
+ }
365
+ z = mprintf("%.*s", (int)(zNL - zRE)+1, zRE);
366
+ zRE += n;
367
+ zErr = re_compile(&pRe, z, 0);
368
+ if( zErr ){
369
+ fossil_warning("robot-exception error \"%s\" in expression \"%s\"\n",
370
+ zErr, z);
371
+ fossil_free(z);
372
+ continue;
373
+ }
374
+ fossil_free(z);
375
+ bMatch = re_match(pRe, (const unsigned char*)zRequest, nRequest);
376
+ re_free(pRe);
377
+ }
378
+ fossil_free(zRequest);
379
+ return bMatch;
380
+}
292381
293382
/*
294383
** Check to see if the page named in the argument is on the
295384
** robot-restrict list. If it is on the list and if the user
296385
** is "nobody" then bring up a captcha to test to make sure that
@@ -302,10 +391,14 @@
302391
*/
303392
int robot_restrict(const char *zTag){
304393
if( robot.resultCache==KNOWN_NOT_ROBOT ) return 0;
305394
if( !robot_restrict_has_tag(zTag) ) return 0;
306395
if( !client_might_be_a_robot() ) return 0;
396
+ if( robot_exception() ){
397
+ robot.resultCache = KNOWN_NOT_ROBOT;
398
+ return 0;
399
+ }
307400
308401
/* Generate the proof-of-work captcha */
309402
ask_for_proof_that_client_is_not_robot();
310403
return 1;
311404
}
312405
--- src/robot.c
+++ src/robot.c
@@ -264,10 +264,28 @@
264 ** particularly difficult to compute. In all other cases, the tag should
265 ** exactly match the page name.
266 **
267 ** Change this setting to "off" to disable all robot restrictions.
268 */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
270 /*
271 ** Return the default restriction GLOB
272 */
273 const char *robot_restrict_default(void){
@@ -287,10 +305,81 @@
287 if( zGlob[0]==0 || fossil_strcmp(zGlob, "off")==0 ){
288 return 0;
289 }
290 return glob_multi_match(zGlob,zTag);
291 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
293 /*
294 ** Check to see if the page named in the argument is on the
295 ** robot-restrict list. If it is on the list and if the user
296 ** is "nobody" then bring up a captcha to test to make sure that
@@ -302,10 +391,14 @@
302 */
303 int robot_restrict(const char *zTag){
304 if( robot.resultCache==KNOWN_NOT_ROBOT ) return 0;
305 if( !robot_restrict_has_tag(zTag) ) return 0;
306 if( !client_might_be_a_robot() ) return 0;
 
 
 
 
307
308 /* Generate the proof-of-work captcha */
309 ask_for_proof_that_client_is_not_robot();
310 return 1;
311 }
312
--- src/robot.c
+++ src/robot.c
@@ -264,10 +264,28 @@
264 ** particularly difficult to compute. In all other cases, the tag should
265 ** exactly match the page name.
266 **
267 ** Change this setting to "off" to disable all robot restrictions.
268 */
269 /*
270 ** SETTING: robot-exception width=40 block-text
271 **
272 ** The value of this setting should be a regular expression.
273 ** If the REQUEST_URI without the SCRIPT_NAME prefix
274 ** matches this regular expression, then the request is an exception
275 ** to anti-robot defenses and should be allowed through. For
276 ** example, to allow robots to download tarballs or ZIP archives
277 ** for named versions and releases, you could use an expression like
278 ** this:
279 **
280 ** ^/(tarball|zip)\\b*\\b(version-|release)\\b
281 **
282 ** This setting can hold multiple regular expressions, one
283 ** regular expression per line. The input URL is exempted from
284 ** anti-robot defenses if any of the multiple regular expressions
285 ** matches.
286 */
287
288 /*
289 ** Return the default restriction GLOB
290 */
291 const char *robot_restrict_default(void){
@@ -287,10 +305,81 @@
305 if( zGlob[0]==0 || fossil_strcmp(zGlob, "off")==0 ){
306 return 0;
307 }
308 return glob_multi_match(zGlob,zTag);
309 }
310
311 /*
312 ** Check the request URI to see if it matches one of the URI
313 ** exceptions listed in the robot-exception setting. Return true
314 ** if it does. Return false if it does not.
315 **
316 ** For the purposes of this routine, the "request URI" means
317 ** the REQUEST_URI value with the SCRIPT_NAME prefix removed and
318 ** with QUERY_STRING appended with a "?" separator if QUERY_STRING
319 ** is not empty.
320 **
321 ** If the robot-exception setting does not exist or is an empty
322 ** string, then return false.
323 */
324 int robot_exception(void){
325 const char *zRE = db_get("robot-exception",0);
326 const char *zQS; /* QUERY_STRING */
327 const char *zURI; /* REQUEST_URI */
328 const char *zSN; /* SCRIPT_NAME */
329 const char *zNL; /* Next newline character */
330 char *zRequest; /* REQUEST_URL w/o SCRIPT_NAME prefix + QUERY_STRING */
331 int nRequest; /* Length of zRequest in bytes */
332 size_t nURI, nSN; /* Length of zURI and zSN */
333 int bMatch = 0; /* True if there is a match */
334
335 if( zRE==0 ) return 0;
336 if( zRE[0]==0 ) return 0;
337 zURI = PD("REQUEST_URI","");
338 nURI = strlen(zURI);
339 zSN = PD("SCRIPT_NAME","");
340 nSN = strlen(zSN);
341 if( nSN<=nURI ) zURI += nSN;
342 zQS = P("QUERY_STRING");
343 if( zQS && zQS[0] ){
344 zRequest = mprintf("%s?%s", zURI, zQS);
345 }else{
346 zRequest = fossil_strdup(zURI);
347 }
348 nRequest = (int)strlen(zRequest);
349 while( zRE[0] && bMatch==0 ){
350 char *z;
351 const char *zErr;
352 size_t n;
353 ReCompiled *pRe;
354 zNL = strchr(zRE,'\n');
355 if( zNL ){
356 n = (size_t)(zNL - zRE)+1;
357 while( zNL>zRE && fossil_isspace(zNL[0]) ) zNL--;
358 if( zNL==zRE ){
359 zRE += n;
360 continue;
361 }
362 }else{
363 n = strlen(zRE);
364 }
365 z = mprintf("%.*s", (int)(zNL - zRE)+1, zRE);
366 zRE += n;
367 zErr = re_compile(&pRe, z, 0);
368 if( zErr ){
369 fossil_warning("robot-exception error \"%s\" in expression \"%s\"\n",
370 zErr, z);
371 fossil_free(z);
372 continue;
373 }
374 fossil_free(z);
375 bMatch = re_match(pRe, (const unsigned char*)zRequest, nRequest);
376 re_free(pRe);
377 }
378 fossil_free(zRequest);
379 return bMatch;
380 }
381
382 /*
383 ** Check to see if the page named in the argument is on the
384 ** robot-restrict list. If it is on the list and if the user
385 ** is "nobody" then bring up a captcha to test to make sure that
@@ -302,10 +391,14 @@
391 */
392 int robot_restrict(const char *zTag){
393 if( robot.resultCache==KNOWN_NOT_ROBOT ) return 0;
394 if( !robot_restrict_has_tag(zTag) ) return 0;
395 if( !client_might_be_a_robot() ) return 0;
396 if( robot_exception() ){
397 robot.resultCache = KNOWN_NOT_ROBOT;
398 return 0;
399 }
400
401 /* Generate the proof-of-work captcha */
402 ask_for_proof_that_client_is_not_robot();
403 return 1;
404 }
405
+17 -2
--- src/setup.c
+++ src/setup.c
@@ -477,12 +477,12 @@
477477
@
478478
@ <form action="%R/setup_robot" method="post"><div>
479479
login_insert_csrf_secret();
480480
@ <input type="submit" name="submit" value="Apply Changes"></p>
481481
@ <hr>
482
- @ <p><b>Do not allow robots access to these pages.</b>
483
- @ <p> If the page name matches the GLOB pattern of this setting, and the
482
+ @ <p><b>Do not allow robots access to these pages.</b><br>
483
+ @ If the page name matches the GLOB pattern of this setting, and the
484484
@ user is "nobody", and the client has not previously passed a captcha
485485
@ test to show that it is not a robot, then the page is not displayed.
486486
@ A captcha test is rendered instead.
487487
@ The recommended value for this setting is:
488488
@ <p>
@@ -499,10 +499,25 @@
499499
@ (Property: robot-restrict)
500500
@ <br>
501501
textarea_attribute("", 2, 80,
502502
"robot-restrict", "rbrestrict", robot_restrict_default(), 0);
503503
504
+ @ <hr>
505
+ @ <p><b>Exceptions to anti-robot restrictions</b><br>
506
+ @ The entry below is a list of regular expressions, one per line.
507
+ @ If any of these regular expressions match the input URL, then the
508
+ @ request is exempt from anti-robot defenses. Use this, for example,
509
+ @ to allow scripts to download release tarballs using a pattern
510
+ @ like:</p>
511
+ @ <p>
512
+ @ &emsp;&emsp;<tt>^/tarball\\b*\\b(version-|release)\\b</tt>
513
+ @ <p>The pattern should match against the REQUEST_URI with the
514
+ @ SCRIPT_NAME prefix removed, and with QUERY_STRING appended following
515
+ @ a "?" if QUERY_STRING exists. (Property: robot-exception)<br>
516
+ textarea_attribute("", 3, 80,
517
+ "robot-exception", "rbexcept", "", 0);
518
+
504519
@ <hr>
505520
addAutoHyperlinkSettings();
506521
507522
@ <hr>
508523
entry_attribute("Anonymous Login Validity", 11, "anon-cookie-lifespan",
509524
--- src/setup.c
+++ src/setup.c
@@ -477,12 +477,12 @@
477 @
478 @ <form action="%R/setup_robot" method="post"><div>
479 login_insert_csrf_secret();
480 @ <input type="submit" name="submit" value="Apply Changes"></p>
481 @ <hr>
482 @ <p><b>Do not allow robots access to these pages.</b>
483 @ <p> If the page name matches the GLOB pattern of this setting, and the
484 @ user is "nobody", and the client has not previously passed a captcha
485 @ test to show that it is not a robot, then the page is not displayed.
486 @ A captcha test is rendered instead.
487 @ The recommended value for this setting is:
488 @ <p>
@@ -499,10 +499,25 @@
499 @ (Property: robot-restrict)
500 @ <br>
501 textarea_attribute("", 2, 80,
502 "robot-restrict", "rbrestrict", robot_restrict_default(), 0);
503
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504 @ <hr>
505 addAutoHyperlinkSettings();
506
507 @ <hr>
508 entry_attribute("Anonymous Login Validity", 11, "anon-cookie-lifespan",
509
--- src/setup.c
+++ src/setup.c
@@ -477,12 +477,12 @@
477 @
478 @ <form action="%R/setup_robot" method="post"><div>
479 login_insert_csrf_secret();
480 @ <input type="submit" name="submit" value="Apply Changes"></p>
481 @ <hr>
482 @ <p><b>Do not allow robots access to these pages.</b><br>
483 @ If the page name matches the GLOB pattern of this setting, and the
484 @ user is "nobody", and the client has not previously passed a captcha
485 @ test to show that it is not a robot, then the page is not displayed.
486 @ A captcha test is rendered instead.
487 @ The recommended value for this setting is:
488 @ <p>
@@ -499,10 +499,25 @@
499 @ (Property: robot-restrict)
500 @ <br>
501 textarea_attribute("", 2, 80,
502 "robot-restrict", "rbrestrict", robot_restrict_default(), 0);
503
504 @ <hr>
505 @ <p><b>Exceptions to anti-robot restrictions</b><br>
506 @ The entry below is a list of regular expressions, one per line.
507 @ If any of these regular expressions match the input URL, then the
508 @ request is exempt from anti-robot defenses. Use this, for example,
509 @ to allow scripts to download release tarballs using a pattern
510 @ like:</p>
511 @ <p>
512 @ &emsp;&emsp;<tt>^/tarball\\b*\\b(version-|release)\\b</tt>
513 @ <p>The pattern should match against the REQUEST_URI with the
514 @ SCRIPT_NAME prefix removed, and with QUERY_STRING appended following
515 @ a "?" if QUERY_STRING exists. (Property: robot-exception)<br>
516 textarea_attribute("", 3, 80,
517 "robot-exception", "rbexcept", "", 0);
518
519 @ <hr>
520 addAutoHyperlinkSettings();
521
522 @ <hr>
523 entry_attribute("Anonymous Login Validity", 11, "anon-cookie-lifespan",
524

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button