Fossil SCM

Add the robot-exception setting.

drh 2025-08-21 14:08 trunk
Commit 86b6ef7fe3424284a38f9706482c58b4e65bd39d2f57ee0bdeb52065440e37a6
3 files changed +18 -5 +93 +17 -2
+18 -5
--- src/regexp.c
+++ src/regexp.c
@@ -850,25 +850,38 @@
850850
** Run a regular expression match over the named disk files, or against
851851
** standard input if no disk files are named on the command-line.
852852
**
853853
** Options:
854854
** -i|--ignore-case Ignore case
855
+** --robot-exception Use the robot-exception setting as the REGEXP
855856
*/
856857
void re_test_grep(void){
857858
ReCompiled *pRe;
858859
const char *zErr;
860
+ int iFileList = 3;
859861
int ignoreCase = find_option("ignore-case","i",0)!=0;
860
- if( g.argc<3 ){
861
- usage("REGEXP [FILE...]");
862
+ int bRobot = find_option("robot-exception",0,0)!=0;
863
+ if( bRobot ){
864
+ const char *zRe;
865
+ db_find_and_open_repository(0,0);
866
+ verify_all_options();
867
+ zRe = db_get("robot-exception","^$");
868
+ zErr = re_compile(&pRe, zRe, ignoreCase);
869
+ iFileList = 2;
870
+ }else{
871
+ verify_all_options();
872
+ if( g.argc<3 ){
873
+ usage("REGEXP [FILE...]");
874
+ }
875
+ zErr = re_compile(&pRe, g.argv[2], ignoreCase);
862876
}
863
- zErr = re_compile(&pRe, g.argv[2], ignoreCase);
864877
if( zErr ) fossil_fatal("%s", zErr);
865
- if( g.argc==3 ){
878
+ if( g.argc==iFileList ){
866879
grep_file(pRe, "-", stdin);
867880
}else{
868881
int i;
869
- for(i=3; i<g.argc; i++){
882
+ for(i=iFileList; i<g.argc; i++){
870883
FILE *in = fossil_fopen(g.argv[i], "rb");
871884
if( in==0 ){
872885
fossil_warning("cannot open \"%s\"", g.argv[i]);
873886
}else{
874887
grep_file(pRe, g.argv[i], in);
875888
--- src/regexp.c
+++ src/regexp.c
@@ -850,25 +850,38 @@
850 ** Run a regular expression match over the named disk files, or against
851 ** standard input if no disk files are named on the command-line.
852 **
853 ** Options:
854 ** -i|--ignore-case Ignore case
 
855 */
856 void re_test_grep(void){
857 ReCompiled *pRe;
858 const char *zErr;
 
859 int ignoreCase = find_option("ignore-case","i",0)!=0;
860 if( g.argc<3 ){
861 usage("REGEXP [FILE...]");
 
 
 
 
 
 
 
 
 
 
 
 
862 }
863 zErr = re_compile(&pRe, g.argv[2], ignoreCase);
864 if( zErr ) fossil_fatal("%s", zErr);
865 if( g.argc==3 ){
866 grep_file(pRe, "-", stdin);
867 }else{
868 int i;
869 for(i=3; i<g.argc; i++){
870 FILE *in = fossil_fopen(g.argv[i], "rb");
871 if( in==0 ){
872 fossil_warning("cannot open \"%s\"", g.argv[i]);
873 }else{
874 grep_file(pRe, g.argv[i], in);
875
--- src/regexp.c
+++ src/regexp.c
@@ -850,25 +850,38 @@
850 ** Run a regular expression match over the named disk files, or against
851 ** standard input if no disk files are named on the command-line.
852 **
853 ** Options:
854 ** -i|--ignore-case Ignore case
855 ** --robot-exception Use the robot-exception setting as the REGEXP
856 */
857 void re_test_grep(void){
858 ReCompiled *pRe;
859 const char *zErr;
860 int iFileList = 3;
861 int ignoreCase = find_option("ignore-case","i",0)!=0;
862 int bRobot = find_option("robot-exception",0,0)!=0;
863 if( bRobot ){
864 const char *zRe;
865 db_find_and_open_repository(0,0);
866 verify_all_options();
867 zRe = db_get("robot-exception","^$");
868 zErr = re_compile(&pRe, zRe, ignoreCase);
869 iFileList = 2;
870 }else{
871 verify_all_options();
872 if( g.argc<3 ){
873 usage("REGEXP [FILE...]");
874 }
875 zErr = re_compile(&pRe, g.argv[2], ignoreCase);
876 }
 
877 if( zErr ) fossil_fatal("%s", zErr);
878 if( g.argc==iFileList ){
879 grep_file(pRe, "-", stdin);
880 }else{
881 int i;
882 for(i=iFileList; i<g.argc; i++){
883 FILE *in = fossil_fopen(g.argv[i], "rb");
884 if( in==0 ){
885 fossil_warning("cannot open \"%s\"", g.argv[i]);
886 }else{
887 grep_file(pRe, g.argv[i], in);
888
+93
--- src/robot.c
+++ src/robot.c
@@ -264,10 +264,28 @@
264264
** particularly difficult to compute. In all other cases, the tag should
265265
** exactly match the page name.
266266
**
267267
** Change this setting to "off" to disable all robot restrictions.
268268
*/
269
+/*
270
+** SETTING: robot-exception width=40 block-text
271
+**
272
+** The value of this setting should be a regular expression.
273
+** If the REQUEST_URI without the SCRIPT_NAME prefix
274
+** matches this regular expression, then the request is an exception
275
+** to anti-robot defenses and should be allowed through. For
276
+** example, to allow robots to download tarballs or ZIP archives
277
+** for named versions and releases, you could use an expression like
278
+** this:
279
+**
280
+** ^/(tarball|zip)\\b*\\b(version-|release)\\b
281
+**
282
+** This setting can hold multiple regular expressions, one
283
+** regular expression per line. The input URL is exempted from
284
+** anti-robot defenses if any of the multiple regular expressions
285
+** matches.
286
+*/
269287
270288
/*
271289
** Return the default restriction GLOB
272290
*/
273291
const char *robot_restrict_default(void){
@@ -287,10 +305,81 @@
287305
if( zGlob[0]==0 || fossil_strcmp(zGlob, "off")==0 ){
288306
return 0;
289307
}
290308
return glob_multi_match(zGlob,zTag);
291309
}
310
+
311
+/*
312
+** Check the request URI to see if it matches one of the URI
313
+** exceptions listed in the robot-exception setting. Return true
314
+** if it does. Return false if it does not.
315
+**
316
+** For the purposes of this routine, the "request URI" means
317
+** the REQUEST_URI value with the SCRIPT_NAME prefix removed and
318
+** with QUERY_STRING appended with a "?" separator if QUERY_STRING
319
+** is not empty.
320
+**
321
+** If the robot-exception setting does not exist or is an empty
322
+** string, then return false.
323
+*/
324
+int robot_exception(void){
325
+ const char *zRE = db_get("robot-exception",0);
326
+ const char *zQS; /* QUERY_STRING */
327
+ const char *zURI; /* REQUEST_URI */
328
+ const char *zSN; /* SCRIPT_NAME */
329
+ const char *zNL; /* Next newline character */
330
+ char *zRequest; /* REQUEST_URL w/o SCRIPT_NAME prefix + QUERY_STRING */
331
+ int nRequest; /* Length of zRequest in bytes */
332
+ size_t nURI, nSN; /* Length of zURI and zSN */
333
+ int bMatch = 0; /* True if there is a match */
334
+
335
+ if( zRE==0 ) return 0;
336
+ if( zRE[0]==0 ) return 0;
337
+ zURI = PD("REQUEST_URI","");
338
+ nURI = strlen(zURI);
339
+ zSN = PD("SCRIPT_NAME","");
340
+ nSN = strlen(zSN);
341
+ if( nSN<=nURI ) zURI += nSN;
342
+ zQS = P("QUERY_STRING");
343
+ if( zQS && zQS[0] ){
344
+ zRequest = mprintf("%s?%s", zURI, zQS);
345
+ }else{
346
+ zRequest = fossil_strdup(zURI);
347
+ }
348
+ nRequest = (int)strlen(zRequest);
349
+ while( zRE[0] && bMatch==0 ){
350
+ char *z;
351
+ const char *zErr;
352
+ size_t n;
353
+ ReCompiled *pRe;
354
+ zNL = strchr(zRE,'\n');
355
+ if( zNL ){
356
+ n = (size_t)(zNL - zRE)+1;
357
+ while( zNL>zRE && fossil_isspace(zNL[0]) ) zNL--;
358
+ if( zNL==zRE ){
359
+ zRE += n;
360
+ continue;
361
+ }
362
+ }else{
363
+ n = strlen(zRE);
364
+ }
365
+ z = mprintf("%.*s", (int)(zNL - zRE)+1, zRE);
366
+ zRE += n;
367
+ zErr = re_compile(&pRe, z, 0);
368
+ if( zErr ){
369
+ fossil_warning("robot-exception error \"%s\" in expression \"%s\"\n",
370
+ zErr, z);
371
+ fossil_free(z);
372
+ continue;
373
+ }
374
+ fossil_free(z);
375
+ bMatch = re_match(pRe, (const unsigned char*)zRequest, nRequest);
376
+ re_free(pRe);
377
+ }
378
+ fossil_free(zRequest);
379
+ return bMatch;
380
+}
292381
293382
/*
294383
** Check to see if the page named in the argument is on the
295384
** robot-restrict list. If it is on the list and if the user
296385
** is "nobody" then bring up a captcha to test to make sure that
@@ -302,10 +391,14 @@
302391
*/
303392
int robot_restrict(const char *zTag){
304393
if( robot.resultCache==KNOWN_NOT_ROBOT ) return 0;
305394
if( !robot_restrict_has_tag(zTag) ) return 0;
306395
if( !client_might_be_a_robot() ) return 0;
396
+ if( robot_exception() ){
397
+ robot.resultCache = KNOWN_NOT_ROBOT;
398
+ return 0;
399
+ }
307400
308401
/* Generate the proof-of-work captcha */
309402
ask_for_proof_that_client_is_not_robot();
310403
return 1;
311404
}
312405
--- src/robot.c
+++ src/robot.c
@@ -264,10 +264,28 @@
264 ** particularly difficult to compute. In all other cases, the tag should
265 ** exactly match the page name.
266 **
267 ** Change this setting to "off" to disable all robot restrictions.
268 */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
270 /*
271 ** Return the default restriction GLOB
272 */
273 const char *robot_restrict_default(void){
@@ -287,10 +305,81 @@
287 if( zGlob[0]==0 || fossil_strcmp(zGlob, "off")==0 ){
288 return 0;
289 }
290 return glob_multi_match(zGlob,zTag);
291 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
293 /*
294 ** Check to see if the page named in the argument is on the
295 ** robot-restrict list. If it is on the list and if the user
296 ** is "nobody" then bring up a captcha to test to make sure that
@@ -302,10 +391,14 @@
302 */
303 int robot_restrict(const char *zTag){
304 if( robot.resultCache==KNOWN_NOT_ROBOT ) return 0;
305 if( !robot_restrict_has_tag(zTag) ) return 0;
306 if( !client_might_be_a_robot() ) return 0;
 
 
 
 
307
308 /* Generate the proof-of-work captcha */
309 ask_for_proof_that_client_is_not_robot();
310 return 1;
311 }
312
--- src/robot.c
+++ src/robot.c
@@ -264,10 +264,28 @@
264 ** particularly difficult to compute. In all other cases, the tag should
265 ** exactly match the page name.
266 **
267 ** Change this setting to "off" to disable all robot restrictions.
268 */
269 /*
270 ** SETTING: robot-exception width=40 block-text
271 **
272 ** The value of this setting should be a regular expression.
273 ** If the REQUEST_URI without the SCRIPT_NAME prefix
274 ** matches this regular expression, then the request is an exception
275 ** to anti-robot defenses and should be allowed through. For
276 ** example, to allow robots to download tarballs or ZIP archives
277 ** for named versions and releases, you could use an expression like
278 ** this:
279 **
280 ** ^/(tarball|zip)\\b*\\b(version-|release)\\b
281 **
282 ** This setting can hold multiple regular expressions, one
283 ** regular expression per line. The input URL is exempted from
284 ** anti-robot defenses if any of the multiple regular expressions
285 ** matches.
286 */
287
288 /*
289 ** Return the default restriction GLOB
290 */
291 const char *robot_restrict_default(void){
@@ -287,10 +305,81 @@
305 if( zGlob[0]==0 || fossil_strcmp(zGlob, "off")==0 ){
306 return 0;
307 }
308 return glob_multi_match(zGlob,zTag);
309 }
310
311 /*
312 ** Check the request URI to see if it matches one of the URI
313 ** exceptions listed in the robot-exception setting. Return true
314 ** if it does. Return false if it does not.
315 **
316 ** For the purposes of this routine, the "request URI" means
317 ** the REQUEST_URI value with the SCRIPT_NAME prefix removed and
318 ** with QUERY_STRING appended with a "?" separator if QUERY_STRING
319 ** is not empty.
320 **
321 ** If the robot-exception setting does not exist or is an empty
322 ** string, then return false.
323 */
324 int robot_exception(void){
325 const char *zRE = db_get("robot-exception",0);
326 const char *zQS; /* QUERY_STRING */
327 const char *zURI; /* REQUEST_URI */
328 const char *zSN; /* SCRIPT_NAME */
329 const char *zNL; /* Next newline character */
330 char *zRequest; /* REQUEST_URL w/o SCRIPT_NAME prefix + QUERY_STRING */
331 int nRequest; /* Length of zRequest in bytes */
332 size_t nURI, nSN; /* Length of zURI and zSN */
333 int bMatch = 0; /* True if there is a match */
334
335 if( zRE==0 ) return 0;
336 if( zRE[0]==0 ) return 0;
337 zURI = PD("REQUEST_URI","");
338 nURI = strlen(zURI);
339 zSN = PD("SCRIPT_NAME","");
340 nSN = strlen(zSN);
341 if( nSN<=nURI ) zURI += nSN;
342 zQS = P("QUERY_STRING");
343 if( zQS && zQS[0] ){
344 zRequest = mprintf("%s?%s", zURI, zQS);
345 }else{
346 zRequest = fossil_strdup(zURI);
347 }
348 nRequest = (int)strlen(zRequest);
349 while( zRE[0] && bMatch==0 ){
350 char *z;
351 const char *zErr;
352 size_t n;
353 ReCompiled *pRe;
354 zNL = strchr(zRE,'\n');
355 if( zNL ){
356 n = (size_t)(zNL - zRE)+1;
357 while( zNL>zRE && fossil_isspace(zNL[0]) ) zNL--;
358 if( zNL==zRE ){
359 zRE += n;
360 continue;
361 }
362 }else{
363 n = strlen(zRE);
364 }
365 z = mprintf("%.*s", (int)(zNL - zRE)+1, zRE);
366 zRE += n;
367 zErr = re_compile(&pRe, z, 0);
368 if( zErr ){
369 fossil_warning("robot-exception error \"%s\" in expression \"%s\"\n",
370 zErr, z);
371 fossil_free(z);
372 continue;
373 }
374 fossil_free(z);
375 bMatch = re_match(pRe, (const unsigned char*)zRequest, nRequest);
376 re_free(pRe);
377 }
378 fossil_free(zRequest);
379 return bMatch;
380 }
381
382 /*
383 ** Check to see if the page named in the argument is on the
384 ** robot-restrict list. If it is on the list and if the user
385 ** is "nobody" then bring up a captcha to test to make sure that
@@ -302,10 +391,14 @@
391 */
392 int robot_restrict(const char *zTag){
393 if( robot.resultCache==KNOWN_NOT_ROBOT ) return 0;
394 if( !robot_restrict_has_tag(zTag) ) return 0;
395 if( !client_might_be_a_robot() ) return 0;
396 if( robot_exception() ){
397 robot.resultCache = KNOWN_NOT_ROBOT;
398 return 0;
399 }
400
401 /* Generate the proof-of-work captcha */
402 ask_for_proof_that_client_is_not_robot();
403 return 1;
404 }
405
+17 -2
--- src/setup.c
+++ src/setup.c
@@ -477,12 +477,12 @@
477477
@
478478
@ <form action="%R/setup_robot" method="post"><div>
479479
login_insert_csrf_secret();
480480
@ <input type="submit" name="submit" value="Apply Changes"></p>
481481
@ <hr>
482
- @ <p><b>Do not allow robots access to these pages.</b>
483
- @ <p> If the page name matches the GLOB pattern of this setting, and the
482
+ @ <p><b>Do not allow robots access to these pages.</b><br>
483
+ @ If the page name matches the GLOB pattern of this setting, and the
484484
@ user is "nobody", and the client has not previously passed a captcha
485485
@ test to show that it is not a robot, then the page is not displayed.
486486
@ A captcha test is rendered instead.
487487
@ The recommended value for this setting is:
488488
@ <p>
@@ -499,10 +499,25 @@
499499
@ (Property: robot-restrict)
500500
@ <br>
501501
textarea_attribute("", 2, 80,
502502
"robot-restrict", "rbrestrict", robot_restrict_default(), 0);
503503
504
+ @ <hr>
505
+ @ <p><b>Exceptions to anti-robot restrictions</b><br>
506
+ @ The entry below is a list of regular expressions, one per line.
507
+ @ If any of these regular expressions match the input URL, then the
508
+ @ request is exempt from anti-robot defenses. Use this, for example,
509
+ @ to allow scripts to download release tarballs using a pattern
510
+ @ like:</p>
511
+ @ <p>
512
+ @ &emsp;&emsp;<tt>^/tarball\\b*\\b(version-|release)\\b</tt>
513
+ @ <p>The pattern should match against the REQUEST_URI with the
514
+ @ SCRIPT_NAME prefix removed, and with QUERY_STRING appended following
515
+ @ a "?" if QUERY_STRING exists. (Property: robot-exception)<br>
516
+ textarea_attribute("", 3, 80,
517
+ "robot-exception", "rbexcept", "", 0);
518
+
504519
@ <hr>
505520
addAutoHyperlinkSettings();
506521
507522
@ <hr>
508523
entry_attribute("Anonymous Login Validity", 11, "anon-cookie-lifespan",
509524
--- src/setup.c
+++ src/setup.c
@@ -477,12 +477,12 @@
477 @
478 @ <form action="%R/setup_robot" method="post"><div>
479 login_insert_csrf_secret();
480 @ <input type="submit" name="submit" value="Apply Changes"></p>
481 @ <hr>
482 @ <p><b>Do not allow robots access to these pages.</b>
483 @ <p> If the page name matches the GLOB pattern of this setting, and the
484 @ user is "nobody", and the client has not previously passed a captcha
485 @ test to show that it is not a robot, then the page is not displayed.
486 @ A captcha test is rendered instead.
487 @ The recommended value for this setting is:
488 @ <p>
@@ -499,10 +499,25 @@
499 @ (Property: robot-restrict)
500 @ <br>
501 textarea_attribute("", 2, 80,
502 "robot-restrict", "rbrestrict", robot_restrict_default(), 0);
503
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504 @ <hr>
505 addAutoHyperlinkSettings();
506
507 @ <hr>
508 entry_attribute("Anonymous Login Validity", 11, "anon-cookie-lifespan",
509
--- src/setup.c
+++ src/setup.c
@@ -477,12 +477,12 @@
477 @
478 @ <form action="%R/setup_robot" method="post"><div>
479 login_insert_csrf_secret();
480 @ <input type="submit" name="submit" value="Apply Changes"></p>
481 @ <hr>
482 @ <p><b>Do not allow robots access to these pages.</b><br>
483 @ If the page name matches the GLOB pattern of this setting, and the
484 @ user is "nobody", and the client has not previously passed a captcha
485 @ test to show that it is not a robot, then the page is not displayed.
486 @ A captcha test is rendered instead.
487 @ The recommended value for this setting is:
488 @ <p>
@@ -499,10 +499,25 @@
499 @ (Property: robot-restrict)
500 @ <br>
501 textarea_attribute("", 2, 80,
502 "robot-restrict", "rbrestrict", robot_restrict_default(), 0);
503
504 @ <hr>
505 @ <p><b>Exceptions to anti-robot restrictions</b><br>
506 @ The entry below is a list of regular expressions, one per line.
507 @ If any of these regular expressions match the input URL, then the
508 @ request is exempt from anti-robot defenses. Use this, for example,
509 @ to allow scripts to download release tarballs using a pattern
510 @ like:</p>
511 @ <p>
512 @ &emsp;&emsp;<tt>^/tarball\\b*\\b(version-|release)\\b</tt>
513 @ <p>The pattern should match against the REQUEST_URI with the
514 @ SCRIPT_NAME prefix removed, and with QUERY_STRING appended following
515 @ a "?" if QUERY_STRING exists. (Property: robot-exception)<br>
516 textarea_attribute("", 3, 80,
517 "robot-exception", "rbexcept", "", 0);
518
519 @ <hr>
520 addAutoHyperlinkSettings();
521
522 @ <hr>
523 entry_attribute("Anonymous Login Validity", 11, "anon-cookie-lifespan",
524

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button