Fossil SCM

New settings to allow robots to download tarballs but only if the corresponding check-in is a leaf or if it has a tag like "release" or "allow-robots". New settings control all of the above.

drh 2025-10-09 12:55 trunk merge
Commit 4d198d0e1270ac3fb44aa3b0041d613435912dd69b0fffd6c5b521a949f4f543
+99 -4
--- src/robot.c
+++ src/robot.c
@@ -39,14 +39,19 @@
3939
unsigned int h1, h2; /* Proof-of-work hash values */
4040
unsigned int resultCache; /* 0: unknown. 1: human 2: might-be-robot */
4141
} robot = { 0, 0, 0 };
4242
4343
/*
44
-** Allowed values for robot.resultCache
44
+** Allowed values for robot.resultCache.
45
+**
46
+** The names are slightly misleading. KNOWN_NOT_ROBOT might be set even
47
+** if the client is a robot, but only if the robot is an approved robot.
48
+** A better name might be "KNOWN_NOT_UNAUTHORIZED_ROBOT", but that is too
49
+** long of a name.
4550
*/
46
-#define KNOWN_NOT_ROBOT 1
47
-#define MIGHT_BE_ROBOT 2
51
+#define KNOWN_NOT_ROBOT 1 /* Approved to consume CPU and bandwidth */
52
+#define MIGHT_BE_ROBOT 2 /* Might be an unapproved robot */
4853
4954
/*
5055
** Compute two hashes, robot.h1 and robot.h2, that are used as
5156
** part of determining whether or not the HTTP client is a robot.
5257
** These hashes are based on current time, client IP address,
@@ -265,11 +270,13 @@
265270
** The "diff" tag covers all diffing pages such as /vdiff, /fdiff, and
266271
** /vpatch. The "annotate" tag also covers /blame and /praise. "zip"
267272
** also covers /tarball and /sqlar. If a tag has an "X" character appended,
268273
** then it only applies if query parameters are such that the page is
269274
** particularly difficult to compute. In all other cases, the tag should
270
-** exactly match the page name.
275
+** exactly match the page name. Useful "X" tags include "timelineX"
276
+** and "zipX". See the robot-zip-leaf and robot-zip-tag settings
277
+** for additional controls associated with the "zipX" restriction.
271278
**
272279
** Change this setting to "off" to disable all robot restrictions.
273280
*/
274281
/*
275282
** SETTING: robot-exception width=40 block-text
@@ -287,10 +294,28 @@
287294
** This setting can hold multiple regular expressions, one
288295
** regular expression per line. The input URL is exempted from
289296
** anti-robot defenses if any of the multiple regular expressions
290297
** matches.
291298
*/
299
+/*
300
+** SETTING: robot-zip-leaf boolean
301
+**
302
+** If this setting is true, robots are allowed to download tarballs,
303
+** ZIP-archives, and SQL-archives even though "zipX" is found in
304
+** the robot-restrict setting as long as the specific check-in being
305
+** downloaded is a leaf check-in.
306
+*/
307
+/*
308
+** SETTING: robot-zip-tag width=40 block-text
309
+**
310
+** If this setting is a list of GLOB patterns matching tags,
311
+** then robots are allowed to download tarballs, ZIP-archives, and
312
+** SQL-archives even though "zipX" appears in robot-restrict, as long as
313
+** the specific check-in being downloaded has a tag that matches
314
+** the GLOB list of this setting. Recommended value:
315
+** "release,robot-access".
316
+*/
292317
293318
/*
294319
** Return the default restriction GLOB
295320
*/
296321
const char *robot_restrict_default(void){
@@ -405,10 +430,67 @@
405430
406431
/* Generate the proof-of-work captcha */
407432
ask_for_proof_that_client_is_not_robot();
408433
return 1;
409434
}
435
+
436
+/*
437
+** Check to see if a robot is allowed to download a tarball, ZIP archive,
438
+** or SQL Archive for a particular check-in identified by the "rid"
439
+** argument. Return true to block the download. Return false to
440
+** continue. Prior to returning true, a captcha is presented to the user.
441
+** No output is generated when returning false.
442
+**
443
+** The rules:
444
+**
445
+** (1) If "zipX" is missing from the robot-restrict setting, then robots
446
+** are allowed to download any archive. None of the remaining rules
447
+** below are consulted unless "zipX" is on the robot-restrict setting.
448
+**
449
+** (2) If the robot-zip-leaf setting is true, then robots are allowed
450
+** to download archives for any leaf check-in. This allows URLs like
451
+** /tarball/trunk/archive.tar.gz to work since branch labels like "trunk"
452
+** always resolve to a leaf.
453
+**
454
+** (3) If the robot-zip-tag setting is a comma-separated list of tags, then any
455
+** check-in that contains one of the tags on that list is allowed to
456
+** be downloaded. This allows check-ins with tags like "release" or
457
+** "robot-access" to be downloaded by robots.
458
+*/
459
+int robot_restrict_zip(int rid){
460
+ const char *zTag;
461
+ if( !robot_restrict_has_tag("zipX") || !client_might_be_a_robot() ){
462
+ return 0; /* Rule (1) */
463
+ }
464
+
465
+ if( db_get_boolean("robot-zip-leaf",0) && is_a_leaf(rid) ){
466
+ return 0; /* Rule (2) */
467
+ }
468
+
469
+ zTag = db_get("robot-zip-tag",0);
470
+ if( zTag && zTag[0] && fossil_strcmp(zTag,"off")!=0 ){
471
+ int ok = 0;
472
+ Stmt q;
473
+ db_prepare(&q,
474
+ "SELECT substr(tagname,5) FROM tagxref, tag"
475
+ " WHERE tagxref.rid=%d"
476
+ " AND tag.tagid=tagxref.tagid"
477
+ " AND tagxref.tagtype=1"
478
+ " AND tag.tagname GLOB 'sym-*'",
479
+ rid
480
+ );
481
+ while( !ok && db_step(&q)==SQLITE_ROW ){
482
+ if( glob_multi_match(zTag, db_column_text(&q,0)) ) ok = 1;
483
+ }
484
+ db_finalize(&q);
485
+ if( ok ) return 0; /* Rule (3) */
486
+ }
487
+
488
+ /* Generate the proof-of-work captcha */
489
+ ask_for_proof_that_client_is_not_robot();
490
+ return 1;
491
+}
410492
411493
/*
412494
** WEBPAGE: test-robotck
413495
**
414496
** Run the robot_restrict() function using the value of the "name="
@@ -416,21 +498,30 @@
416498
** logic.
417499
**
418500
** Whenever this page is successfully rendered (when it doesn't go to
419501
** the captcha) it deletes the proof-of-work cookie. So reloading the
420502
** page will reset the cookie and restart the verification.
503
+**
504
+** If the zip=CHECKIN query parameter is provided, then also invoke
505
+** robot_restrict_zip() on the RID of CHECKIN.
421506
*/
422507
void robot_restrict_test_page(void){
423508
const char *zName = P("name");
509
+ const char *zZip = P("zip");
424510
const char *zP1 = P("proof");
425511
const char *zP2 = P(ROBOT_COOKIE);
426512
const char *z;
513
+ int rid = 0;
427514
if( zName==0 || zName[0]==0 ) zName = g.zPath;
428515
login_check_credentials();
429516
if( g.zLogin==0 ){ login_needed(1); return; }
430517
g.zLogin = 0;
431518
if( robot_restrict(zName) ) return;
519
+ if( zZip && zZip[0] ){
520
+ rid = symbolic_name_to_rid(zZip, "ci");
521
+ if( rid && robot_restrict_zip(rid) ) return;
522
+ }
432523
style_set_current_feature("test");
433524
style_header("robot_restrict() test");
434525
@ <h1>Captcha passed</h1>
435526
@
436527
@ <p>
@@ -438,10 +529,14 @@
438529
@ proof=%h(zP1)<br>
439530
}
440531
if( zP2 && zP2[0] ){
441532
@ %h(ROBOT_COOKIE)=%h(zP2)<br>
442533
cgi_set_cookie(ROBOT_COOKIE,"",0,-1);
534
+ }
535
+ if( zZip && zZip[0] ){
536
+ @ zip=%h(zZip)<br>
537
+ @ rid=%d(rid)<br>
443538
}
444539
if( g.perm.Admin ){
445540
z = db_get("robot-restrict",robot_restrict_default());
446541
if( z && z[0] ){
447542
@ robot-restrict=%h(z)</br>
448543
--- src/robot.c
+++ src/robot.c
@@ -39,14 +39,19 @@
39 unsigned int h1, h2; /* Proof-of-work hash values */
40 unsigned int resultCache; /* 0: unknown. 1: human 2: might-be-robot */
41 } robot = { 0, 0, 0 };
42
43 /*
44 ** Allowed values for robot.resultCache
 
 
 
 
 
45 */
46 #define KNOWN_NOT_ROBOT 1
47 #define MIGHT_BE_ROBOT 2
48
49 /*
50 ** Compute two hashes, robot.h1 and robot.h2, that are used as
51 ** part of determining whether or not the HTTP client is a robot.
52 ** These hashes are based on current time, client IP address,
@@ -265,11 +270,13 @@
265 ** The "diff" tag covers all diffing pages such as /vdiff, /fdiff, and
266 ** /vpatch. The "annotate" tag also covers /blame and /praise. "zip"
267 ** also covers /tarball and /sqlar. If a tag has an "X" character appended,
268 ** then it only applies if query parameters are such that the page is
269 ** particularly difficult to compute. In all other cases, the tag should
270 ** exactly match the page name.
 
 
271 **
272 ** Change this setting to "off" to disable all robot restrictions.
273 */
274 /*
275 ** SETTING: robot-exception width=40 block-text
@@ -287,10 +294,28 @@
287 ** This setting can hold multiple regular expressions, one
288 ** regular expression per line. The input URL is exempted from
289 ** anti-robot defenses if any of the multiple regular expressions
290 ** matches.
291 */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
293 /*
294 ** Return the default restriction GLOB
295 */
296 const char *robot_restrict_default(void){
@@ -405,10 +430,67 @@
405
406 /* Generate the proof-of-work captcha */
407 ask_for_proof_that_client_is_not_robot();
408 return 1;
409 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
411 /*
412 ** WEBPAGE: test-robotck
413 **
414 ** Run the robot_restrict() function using the value of the "name="
@@ -416,21 +498,30 @@
416 ** logic.
417 **
418 ** Whenever this page is successfully rendered (when it doesn't go to
419 ** the captcha) it deletes the proof-of-work cookie. So reloading the
420 ** page will reset the cookie and restart the verification.
 
 
 
421 */
422 void robot_restrict_test_page(void){
423 const char *zName = P("name");
 
424 const char *zP1 = P("proof");
425 const char *zP2 = P(ROBOT_COOKIE);
426 const char *z;
 
427 if( zName==0 || zName[0]==0 ) zName = g.zPath;
428 login_check_credentials();
429 if( g.zLogin==0 ){ login_needed(1); return; }
430 g.zLogin = 0;
431 if( robot_restrict(zName) ) return;
 
 
 
 
432 style_set_current_feature("test");
433 style_header("robot_restrict() test");
434 @ <h1>Captcha passed</h1>
435 @
436 @ <p>
@@ -438,10 +529,14 @@
438 @ proof=%h(zP1)<br>
439 }
440 if( zP2 && zP2[0] ){
441 @ %h(ROBOT_COOKIE)=%h(zP2)<br>
442 cgi_set_cookie(ROBOT_COOKIE,"",0,-1);
 
 
 
 
443 }
444 if( g.perm.Admin ){
445 z = db_get("robot-restrict",robot_restrict_default());
446 if( z && z[0] ){
447 @ robot-restrict=%h(z)</br>
448
--- src/robot.c
+++ src/robot.c
@@ -39,14 +39,19 @@
39 unsigned int h1, h2; /* Proof-of-work hash values */
40 unsigned int resultCache; /* 0: unknown. 1: human 2: might-be-robot */
41 } robot = { 0, 0, 0 };
42
43 /*
44 ** Allowed values for robot.resultCache.
45 **
46 ** The names are slightly misleading. KNOWN_NOT_ROBOT might be set even
47 ** if the client is a robot, but only if the robot is an approved robot.
48 ** A better name might be "KNOWN_NOT_UNAUTHORIZED_ROBOT", but that is too
49 ** long of a name.
50 */
51 #define KNOWN_NOT_ROBOT 1 /* Approved to consume CPU and bandwidth */
52 #define MIGHT_BE_ROBOT 2 /* Might be an unapproved robot */
53
54 /*
55 ** Compute two hashes, robot.h1 and robot.h2, that are used as
56 ** part of determining whether or not the HTTP client is a robot.
57 ** These hashes are based on current time, client IP address,
@@ -265,11 +270,13 @@
270 ** The "diff" tag covers all diffing pages such as /vdiff, /fdiff, and
271 ** /vpatch. The "annotate" tag also covers /blame and /praise. "zip"
272 ** also covers /tarball and /sqlar. If a tag has an "X" character appended,
273 ** then it only applies if query parameters are such that the page is
274 ** particularly difficult to compute. In all other cases, the tag should
275 ** exactly match the page name. Useful "X" tags include "timelineX"
276 ** and "zipX". See the robot-zip-leaf and robot-zip-tag settings
277 ** for additional controls associated with the "zipX" restriction.
278 **
279 ** Change this setting to "off" to disable all robot restrictions.
280 */
281 /*
282 ** SETTING: robot-exception width=40 block-text
@@ -287,10 +294,28 @@
294 ** This setting can hold multiple regular expressions, one
295 ** regular expression per line. The input URL is exempted from
296 ** anti-robot defenses if any of the multiple regular expressions
297 ** matches.
298 */
299 /*
300 ** SETTING: robot-zip-leaf boolean
301 **
302 ** If this setting is true, robots are allowed to download tarballs,
303 ** ZIP-archives, and SQL-archives even though "zipX" is found in
304 ** the robot-restrict setting as long as the specific check-in being
305 ** downloaded is a leaf check-in.
306 */
307 /*
308 ** SETTING: robot-zip-tag width=40 block-text
309 **
310 ** If this setting is a list of GLOB patterns matching tags,
311 ** then robots are allowed to download tarballs, ZIP-archives, and
312 ** SQL-archives even though "zipX" appears in robot-restrict, as long as
313 ** the specific check-in being downloaded has a tag that matches
314 ** the GLOB list of this setting. Recommended value:
315 ** "release,robot-access".
316 */
317
318 /*
319 ** Return the default restriction GLOB
320 */
321 const char *robot_restrict_default(void){
@@ -405,10 +430,67 @@
430
431 /* Generate the proof-of-work captcha */
432 ask_for_proof_that_client_is_not_robot();
433 return 1;
434 }
435
436 /*
437 ** Check to see if a robot is allowed to download a tarball, ZIP archive,
438 ** or SQL Archive for a particular check-in identified by the "rid"
439 ** argument. Return true to block the download. Return false to
440 ** continue. Prior to returning true, a captcha is presented to the user.
441 ** No output is generated when returning false.
442 **
443 ** The rules:
444 **
445 ** (1) If "zipX" is missing from the robot-restrict setting, then robots
446 ** are allowed to download any archive. None of the remaining rules
447 ** below are consulted unless "zipX" is on the robot-restrict setting.
448 **
449 ** (2) If the robot-zip-leaf setting is true, then robots are allowed
450 ** to download archives for any leaf check-in. This allows URLs like
451 ** /tarball/trunk/archive.tar.gz to work since branch labels like "trunk"
452 ** always resolve to a leaf.
453 **
454 ** (3) If the robot-zip-tag setting is a comma-separated list of tags, then any
455 ** check-in that contains one of the tags on that list is allowed to
456 ** be downloaded. This allows check-ins with tags like "release" or
457 ** "robot-access" to be downloaded by robots.
458 */
459 int robot_restrict_zip(int rid){
460 const char *zTag;
461 if( !robot_restrict_has_tag("zipX") || !client_might_be_a_robot() ){
462 return 0; /* Rule (1) */
463 }
464
465 if( db_get_boolean("robot-zip-leaf",0) && is_a_leaf(rid) ){
466 return 0; /* Rule (2) */
467 }
468
469 zTag = db_get("robot-zip-tag",0);
470 if( zTag && zTag[0] && fossil_strcmp(zTag,"off")!=0 ){
471 int ok = 0;
472 Stmt q;
473 db_prepare(&q,
474 "SELECT substr(tagname,5) FROM tagxref, tag"
475 " WHERE tagxref.rid=%d"
476 " AND tag.tagid=tagxref.tagid"
477 " AND tagxref.tagtype=1"
478 " AND tag.tagname GLOB 'sym-*'",
479 rid
480 );
481 while( !ok && db_step(&q)==SQLITE_ROW ){
482 if( glob_multi_match(zTag, db_column_text(&q,0)) ) ok = 1;
483 }
484 db_finalize(&q);
485 if( ok ) return 0; /* Rule (3) */
486 }
487
488 /* Generate the proof-of-work captcha */
489 ask_for_proof_that_client_is_not_robot();
490 return 1;
491 }
492
493 /*
494 ** WEBPAGE: test-robotck
495 **
496 ** Run the robot_restrict() function using the value of the "name="
@@ -416,21 +498,30 @@
498 ** logic.
499 **
500 ** Whenever this page is successfully rendered (when it doesn't go to
501 ** the captcha) it deletes the proof-of-work cookie. So reloading the
502 ** page will reset the cookie and restart the verification.
503 **
504 ** If the zip=CHECKIN query parameter is provided, then also invoke
505 ** robot_restrict_zip() on the RID of CHECKIN.
506 */
507 void robot_restrict_test_page(void){
508 const char *zName = P("name");
509 const char *zZip = P("zip");
510 const char *zP1 = P("proof");
511 const char *zP2 = P(ROBOT_COOKIE);
512 const char *z;
513 int rid = 0;
514 if( zName==0 || zName[0]==0 ) zName = g.zPath;
515 login_check_credentials();
516 if( g.zLogin==0 ){ login_needed(1); return; }
517 g.zLogin = 0;
518 if( robot_restrict(zName) ) return;
519 if( zZip && zZip[0] ){
520 rid = symbolic_name_to_rid(zZip, "ci");
521 if( rid && robot_restrict_zip(rid) ) return;
522 }
523 style_set_current_feature("test");
524 style_header("robot_restrict() test");
525 @ <h1>Captcha passed</h1>
526 @
527 @ <p>
@@ -438,10 +529,14 @@
529 @ proof=%h(zP1)<br>
530 }
531 if( zP2 && zP2[0] ){
532 @ %h(ROBOT_COOKIE)=%h(zP2)<br>
533 cgi_set_cookie(ROBOT_COOKIE,"",0,-1);
534 }
535 if( zZip && zZip[0] ){
536 @ zip=%h(zZip)<br>
537 @ rid=%d(rid)<br>
538 }
539 if( g.perm.Admin ){
540 z = db_get("robot-restrict",robot_restrict_default());
541 if( z && z[0] ){
542 @ robot-restrict=%h(z)</br>
543
+24 -16
--- src/setup.c
+++ src/setup.c
@@ -470,12 +470,12 @@
470470
@ <p>A Fossil website can have billions of pages in its tree, even for a
471471
@ modest project. Many of those pages (examples: diffs and tarballs)
472472
@ might be expensive to compute. A robot that tries to walk the entire
473473
@ website can present a crippling CPU and bandwidth load.
474474
@
475
- @ <p>The settings on this page are intended to help site administrators
476
- @ defend the site against robots.
475
+ @ <p>The settings on this page are intended to help administrators
476
+ @ defend against abusive robots.
477477
@
478478
@ <form action="%R/setup_robot" method="post"><div>
479479
login_insert_csrf_secret();
480480
@ <input type="submit" name="submit" value="Apply Changes"></p>
481481
@ <hr>
@@ -482,43 +482,51 @@
482482
@ <p><b>Do not allow robots access to these pages.</b><br>
483483
@ If the page name matches the GLOB pattern of this setting, and the
484484
@ user is "nobody", and the client has not previously passed a captcha
485485
@ test to show that it is not a robot, then the page is not displayed.
486486
@ A captcha test is rendered instead.
487
- @ The recommended value for this setting is:
487
+ @ The default value for this setting is:
488488
@ <p>
489489
@ &emsp;&emsp;&emsp;<tt>%h(robot_restrict_default())</tt>
490490
@ <p>
491491
@ The "diff" tag covers all diffing pages such as /vdiff, /fdiff, and
492492
@ /vpatch. The "annotate" tag covers /annotate and also /blame and
493493
@ /praise. The "zip" covers itself and also /tarball and /sqlar. If a
494494
@ tag has an "X" character appended, then it only applies if query
495
- @ parameters are such that the page is particularly difficult to compute.
495
+ @ parameters are such that the page is expensive and/or unusual.
496496
@ In all other cases, the tag should exactly match the page name.
497497
@
498498
@ To disable robot restrictions, change this setting to "off".
499499
@ (Property: robot-restrict)
500500
@ <br>
501501
textarea_attribute("", 2, 80,
502502
"robot-restrict", "rbrestrict", robot_restrict_default(), 0);
503503
504
- @ <hr>
505
- @ <p><b>Exceptions to anti-robot restrictions</b><br>
506
- @ The entry below is a list of
507
- @ <a href="%R/re_rules">regular expressions</a>, one per line.
508
- @ If any of these regular expressions match the input URL, then the
509
- @ request is exempt from anti-robot defenses. Use this, for example,
510
- @ to allow scripts to download release tarballs using a pattern
511
- @ like:</p>
512
- @ <p>
513
- @ &emsp;&emsp;<tt>^/tarball/(version-[0-9.]+|release)/</tt>
514
- @ <p>The pattern should match against the REQUEST_URI with the
504
+ @ <p><b>Exception #1</b><br>
505
+ @ If "zipX" appears in the robot-restrict list above, then tarballs,
506
+ @ ZIP-archives, and SQL-archives may be downloaded by robots if
507
+ @ the check-in is a leaf (robot-zip-leaf):<br>
508
+ onoff_attribute("Allow tarballs for leaf check-ins",
509
+ "robot-zip-leaf", "rzleaf", 0, 0);
510
+
511
+ @ <p><b>Exception #2</b><br>
512
+ @ If "zipX" appears in the robot-restrict list above, then tarballs,
513
+ @ ZIP-archives, and SQL-archives may be downloaded by robots if
514
+ @ the check-in has one or more tags that match the following
515
+ @ list of GLOB patterns: (robot-zip-tag)<br>
516
+ textarea_attribute("", 2, 80,
517
+ "robot-zip-tag", "rztag", "", 0);
518
+
519
+ @ <p><b>Exception #3</b><br>
520
+ @ If the request URI matches any of the following
521
+ @ <a href="%R/re_rules">regular expressions</a> (one per line), then the
522
+ @ request is exempt from anti-robot defenses.
523
+ @ The regular expression is matched against the REQUEST_URI with the
515524
@ SCRIPT_NAME prefix removed, and with QUERY_STRING appended following
516525
@ a "?" if QUERY_STRING exists. (Property: robot-exception)<br>
517526
textarea_attribute("", 3, 80,
518527
"robot-exception", "rbexcept", "", 0);
519
-
520528
@ <hr>
521529
addAutoHyperlinkSettings();
522530
523531
@ <hr>
524532
entry_attribute("Anonymous Login Validity", 11, "anon-cookie-lifespan",
525533
--- src/setup.c
+++ src/setup.c
@@ -470,12 +470,12 @@
470 @ <p>A Fossil website can have billions of pages in its tree, even for a
471 @ modest project. Many of those pages (examples: diffs and tarballs)
472 @ might be expensive to compute. A robot that tries to walk the entire
473 @ website can present a crippling CPU and bandwidth load.
474 @
475 @ <p>The settings on this page are intended to help site administrators
476 @ defend the site against robots.
477 @
478 @ <form action="%R/setup_robot" method="post"><div>
479 login_insert_csrf_secret();
480 @ <input type="submit" name="submit" value="Apply Changes"></p>
481 @ <hr>
@@ -482,43 +482,51 @@
482 @ <p><b>Do not allow robots access to these pages.</b><br>
483 @ If the page name matches the GLOB pattern of this setting, and the
484 @ user is "nobody", and the client has not previously passed a captcha
485 @ test to show that it is not a robot, then the page is not displayed.
486 @ A captcha test is rendered instead.
487 @ The recommended value for this setting is:
488 @ <p>
489 @ &emsp;&emsp;&emsp;<tt>%h(robot_restrict_default())</tt>
490 @ <p>
491 @ The "diff" tag covers all diffing pages such as /vdiff, /fdiff, and
492 @ /vpatch. The "annotate" tag covers /annotate and also /blame and
493 @ /praise. The "zip" covers itself and also /tarball and /sqlar. If a
494 @ tag has an "X" character appended, then it only applies if query
495 @ parameters are such that the page is particularly difficult to compute.
496 @ In all other cases, the tag should exactly match the page name.
497 @
498 @ To disable robot restrictions, change this setting to "off".
499 @ (Property: robot-restrict)
500 @ <br>
501 textarea_attribute("", 2, 80,
502 "robot-restrict", "rbrestrict", robot_restrict_default(), 0);
503
504 @ <hr>
505 @ <p><b>Exceptions to anti-robot restrictions</b><br>
506 @ The entry below is a list of
507 @ <a href="%R/re_rules">regular expressions</a>, one per line.
508 @ If any of these regular expressions match the input URL, then the
509 @ request is exempt from anti-robot defenses. Use this, for example,
510 @ to allow scripts to download release tarballs using a pattern
511 @ like:</p>
512 @ <p>
513 @ &emsp;&emsp;<tt>^/tarball/(version-[0-9.]+|release)/</tt>
514 @ <p>The pattern should match against the REQUEST_URI with the
 
 
 
 
 
 
 
 
 
515 @ SCRIPT_NAME prefix removed, and with QUERY_STRING appended following
516 @ a "?" if QUERY_STRING exists. (Property: robot-exception)<br>
517 textarea_attribute("", 3, 80,
518 "robot-exception", "rbexcept", "", 0);
519
520 @ <hr>
521 addAutoHyperlinkSettings();
522
523 @ <hr>
524 entry_attribute("Anonymous Login Validity", 11, "anon-cookie-lifespan",
525
--- src/setup.c
+++ src/setup.c
@@ -470,12 +470,12 @@
470 @ <p>A Fossil website can have billions of pages in its tree, even for a
471 @ modest project. Many of those pages (examples: diffs and tarballs)
472 @ might be expensive to compute. A robot that tries to walk the entire
473 @ website can present a crippling CPU and bandwidth load.
474 @
475 @ <p>The settings on this page are intended to help administrators
476 @ defend against abusive robots.
477 @
478 @ <form action="%R/setup_robot" method="post"><div>
479 login_insert_csrf_secret();
480 @ <input type="submit" name="submit" value="Apply Changes"></p>
481 @ <hr>
@@ -482,43 +482,51 @@
482 @ <p><b>Do not allow robots access to these pages.</b><br>
483 @ If the page name matches the GLOB pattern of this setting, and the
484 @ user is "nobody", and the client has not previously passed a captcha
485 @ test to show that it is not a robot, then the page is not displayed.
486 @ A captcha test is rendered instead.
487 @ The default value for this setting is:
488 @ <p>
489 @ &emsp;&emsp;&emsp;<tt>%h(robot_restrict_default())</tt>
490 @ <p>
491 @ The "diff" tag covers all diffing pages such as /vdiff, /fdiff, and
492 @ /vpatch. The "annotate" tag covers /annotate and also /blame and
493 @ /praise. The "zip" covers itself and also /tarball and /sqlar. If a
494 @ tag has an "X" character appended, then it only applies if query
495 @ parameters are such that the page is expensive and/or unusual.
496 @ In all other cases, the tag should exactly match the page name.
497 @
498 @ To disable robot restrictions, change this setting to "off".
499 @ (Property: robot-restrict)
500 @ <br>
501 textarea_attribute("", 2, 80,
502 "robot-restrict", "rbrestrict", robot_restrict_default(), 0);
503
504 @ <p><b>Exception #1</b><br>
505 @ If "zipX" appears in the robot-restrict list above, then tarballs,
506 @ ZIP-archives, and SQL-archives may be downloaded by robots if
507 @ the check-in is a leaf (robot-zip-leaf):<br>
508 onoff_attribute("Allow tarballs for leaf check-ins",
509 "robot-zip-leaf", "rzleaf", 0, 0);
510
511 @ <p><b>Exception #2</b><br>
512 @ If "zipX" appears in the robot-restrict list above, then tarballs,
513 @ ZIP-archives, and SQL-archives may be downloaded by robots if
514 @ the check-in has one or more tags that match the following
515 @ list of GLOB patterns: (robot-zip-tag)<br>
516 textarea_attribute("", 2, 80,
517 "robot-zip-tag", "rztag", "", 0);
518
519 @ <p><b>Exception #3</b><br>
520 @ If the request URI matches any of the following
521 @ <a href="%R/re_rules">regular expressions</a> (one per line), then the
522 @ request is exempt from anti-robot defenses.
523 @ The regular expression is matched against the REQUEST_URI with the
524 @ SCRIPT_NAME prefix removed, and with QUERY_STRING appended following
525 @ a "?" if QUERY_STRING exists. (Property: robot-exception)<br>
526 textarea_attribute("", 3, 80,
527 "robot-exception", "rbexcept", "", 0);
 
528 @ <hr>
529 addAutoHyperlinkSettings();
530
531 @ <hr>
532 entry_attribute("Anonymous Login Validity", 11, "anon-cookie-lifespan",
533
+16
--- src/tar.c
+++ src/tar.c
@@ -812,10 +812,25 @@
812812
**
813813
** ex=PATTERN Omit any file that matches PATTERN. PATTERN is a
814814
** comma-separated list of GLOB patterns, where each
815815
** pattern can optionally be quoted using ".." or '..'.
816816
** Any file matching both ex= and in= is excluded.
817
+**
818
+** Robot Defenses:
819
+**
820
+** * If "zip" appears in the robot-restrict setting, then robots are
821
+** not allowed to access this page. Suspected robots will be
822
+** presented with a captcha.
823
+**
824
+** * If "zipX" appears in the robot-restrict setting, then robots are
825
+** restricted in the same way as with "zip", but with exceptions.
826
+** If the check-in for which an archive is requested is a leaf check-in
827
+** and if the robot-zip-leaf setting is true, then the request is
828
+** allowed. Or if the check-in has a tag that matches any of the
829
+** GLOB patterns on the list in the robot-zip-tag setting, then the
830
+** request is allowed. Otherwise, the usual robot defenses are
831
+** activated.
817832
*/
818833
void tarball_page(void){
819834
int rid;
820835
char *zName, *zRid, *zKey;
821836
int nName, nRid;
@@ -864,10 +879,11 @@
864879
if( rid==0 ){
865880
cgi_set_status(404, "Not Found");
866881
@ Not found
867882
return;
868883
}
884
+ if( robot_restrict_zip(rid) ) return;
869885
if( nRid==0 && nName>10 ) zName[10] = 0;
870886
871887
/* Compute a unique key for the cache entry based on query parameters */
872888
blob_init(&cacheKey, 0, 0);
873889
blob_appendf(&cacheKey, "/tarball/%z", rid_to_uuid(rid));
874890
--- src/tar.c
+++ src/tar.c
@@ -812,10 +812,25 @@
812 **
813 ** ex=PATTERN Omit any file that matches PATTERN. PATTERN is a
814 ** comma-separated list of GLOB patterns, where each
815 ** pattern can optionally be quoted using ".." or '..'.
816 ** Any file matching both ex= and in= is excluded.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
817 */
818 void tarball_page(void){
819 int rid;
820 char *zName, *zRid, *zKey;
821 int nName, nRid;
@@ -864,10 +879,11 @@
864 if( rid==0 ){
865 cgi_set_status(404, "Not Found");
866 @ Not found
867 return;
868 }
 
869 if( nRid==0 && nName>10 ) zName[10] = 0;
870
871 /* Compute a unique key for the cache entry based on query parameters */
872 blob_init(&cacheKey, 0, 0);
873 blob_appendf(&cacheKey, "/tarball/%z", rid_to_uuid(rid));
874
--- src/tar.c
+++ src/tar.c
@@ -812,10 +812,25 @@
812 **
813 ** ex=PATTERN Omit any file that matches PATTERN. PATTERN is a
814 ** comma-separated list of GLOB patterns, where each
815 ** pattern can optionally be quoted using ".." or '..'.
816 ** Any file matching both ex= and in= is excluded.
817 **
818 ** Robot Defenses:
819 **
820 ** * If "zip" appears in the robot-restrict setting, then robots are
821 ** not allowed to access this page. Suspected robots will be
822 ** presented with a captcha.
823 **
824 ** * If "zipX" appears in the robot-restrict setting, then robots are
825 ** restricted in the same way as with "zip", but with exceptions.
826 ** If the check-in for which an archive is requested is a leaf check-in
827 ** and if the robot-zip-leaf setting is true, then the request is
828 ** allowed. Or if the check-in has a tag that matches any of the
829 ** GLOB patterns on the list in the robot-zip-tag setting, then the
830 ** request is allowed. Otherwise, the usual robot defenses are
831 ** activated.
832 */
833 void tarball_page(void){
834 int rid;
835 char *zName, *zRid, *zKey;
836 int nName, nRid;
@@ -864,10 +879,11 @@
879 if( rid==0 ){
880 cgi_set_status(404, "Not Found");
881 @ Not found
882 return;
883 }
884 if( robot_restrict_zip(rid) ) return;
885 if( nRid==0 && nName>10 ) zName[10] = 0;
886
887 /* Compute a unique key for the cache entry based on query parameters */
888 blob_init(&cacheKey, 0, 0);
889 blob_appendf(&cacheKey, "/tarball/%z", rid_to_uuid(rid));
890
+16
--- src/zip.c
+++ src/zip.c
@@ -995,10 +995,25 @@
995995
**
996996
996 ** ex=PATTERN Omit any file that matches PATTERN. PATTERN is a
997997
** comma-separated list of GLOB patterns, where each
998998
** pattern can optionally be quoted using ".." or '..'.
999999
** Any file matching both ex= and in= is excluded.
1000
+**
1001
+** Robot Defenses:
1002
+**
1003
+** * If "zip" appears in the robot-restrict setting, then robots are
1004
+** not allowed to access this page. Suspected robots will be
1005
+** presented with a captcha.
1006
+**
1007
+** * If "zipX" appears in the robot-restrict setting, then robots are
1008
+** restricted in the same way as with "zip", but with exceptions.
1009
+** If the check-in for which an archive is requested is a leaf check-in
1010
+** and if the robot-zip-leaf setting is true, then the request is
1011
+** allowed. Or if the check-in has a tag that matches any of the
1012
+** GLOB patterns on the list in the robot-zip-tag setting, then the
1013
+** request is allowed. Otherwise, the usual robot defenses are
1014
+** activated.
10001015
*/
10011016
void baseline_zip_page(void){
10021017
int rid;
10031018
const char *z;
10041019
char *zName, *zRid, *zKey;
@@ -1069,10 +1084,11 @@
10691084
if( rid<=0 ){
10701085
cgi_set_status(404, "Not Found");
10711086
@ Not found
10721087
return;
10731088
}
1089
+ if( robot_restrict_zip(rid) ) return;
10741090
if( nRid==0 && nName>10 ) zName[10] = 0;
10751091
10761092
/* Compute a unique key for the cache entry based on query parameters */
10771093
blob_init(&cacheKey, 0, 0);
10781094
blob_appendf(&cacheKey, "/%s/%z", g.zPath, rid_to_uuid(rid));
10791095
--- src/zip.c
+++ src/zip.c
@@ -995,10 +995,25 @@
995 **
996 ** ex=PATTERN Omit any file that matches PATTERN. PATTERN is a
997 ** comma-separated list of GLOB patterns, where each
998 ** pattern can optionally be quoted using ".." or '..'.
999 ** Any file matching both ex= and in= is excluded.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1000 */
1001 void baseline_zip_page(void){
1002 int rid;
1003 const char *z;
1004 char *zName, *zRid, *zKey;
@@ -1069,10 +1084,11 @@
1069 if( rid<=0 ){
1070 cgi_set_status(404, "Not Found");
1071 @ Not found
1072 return;
1073 }
 
1074 if( nRid==0 && nName>10 ) zName[10] = 0;
1075
1076 /* Compute a unique key for the cache entry based on query parameters */
1077 blob_init(&cacheKey, 0, 0);
1078 blob_appendf(&cacheKey, "/%s/%z", g.zPath, rid_to_uuid(rid));
1079
--- src/zip.c
+++ src/zip.c
@@ -995,10 +995,25 @@
995 **
996 ** ex=PATTERN Omit any file that matches PATTERN. PATTERN is a
997 ** comma-separated list of GLOB patterns, where each
998 ** pattern can optionally be quoted using ".." or '..'.
999 ** Any file matching both ex= and in= is excluded.
1000 **
1001 ** Robot Defenses:
1002 **
1003 ** * If "zip" appears in the robot-restrict setting, then robots are
1004 ** not allowed to access this page. Suspected robots will be
1005 ** presented with a captcha.
1006 **
1007 ** * If "zipX" appears in the robot-restrict setting, then robots are
1008 ** restricted in the same way as with "zip", but with exceptions.
1009 ** If the check-in for which an archive is requested is a leaf check-in
1010 ** and if the robot-zip-leaf setting is true, then the request is
1011 ** allowed. Or if the check-in has a tag that matches any of the
1012 ** GLOB patterns on the list in the robot-zip-tag setting, then the
1013 ** request is allowed. Otherwise, the usual robot defenses are
1014 ** activated.
1015 */
1016 void baseline_zip_page(void){
1017 int rid;
1018 const char *z;
1019 char *zName, *zRid, *zKey;
@@ -1069,10 +1084,11 @@
1084 if( rid<=0 ){
1085 cgi_set_status(404, "Not Found");
1086 @ Not found
1087 return;
1088 }
1089 if( robot_restrict_zip(rid) ) return;
1090 if( nRid==0 && nName>10 ) zName[10] = 0;
1091
1092 /* Compute a unique key for the cache entry based on query parameters */
1093 blob_init(&cacheKey, 0, 0);
1094 blob_appendf(&cacheKey, "/%s/%z", g.zPath, rid_to_uuid(rid));
1095

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button