Fossil SCM

From the changes.wiki for Fossil 1.25: "Disallow invalid UTF8 characters (such as characters in the surrogate pair range) in filenames." This commit completes the set of UTF-8 sequences that are generally considered invalid and should therefore be disallowed in filenames: the "overlong form", invalid continuation bytes, and, finally, noncharacters.

jan.nijtmans 2013-01-21 09:39 UTC trunk

Commit 011d5f692d3dc5535bf1a24567c29b8b4dd45e15

Parent 1d462a683f082c6…

1 file changed +49 -28

~ src/file.c

M src/file.c

+49 -28

		--- src/file.c
		+++ src/file.c
		@@ -490,44 +490,65 @@
490	490	** * Does not contain two or more "/" characters in a row.
491	491	** * Contains at least one character
492	492	**
493	493	** Invalid UTF8 characters result in a false return if bStrictUtf8 is
494	494	** true. If bStrictUtf8 is false, invalid UTF8 characters are silently
495		-** ignored.
	495	+** ignored. See http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
	496	+** and http://en.wikipedia.org/wiki/Unicode (for the noncharacters)
496	497	*/
497	498	int file_is_simple_pathname(const char *z, int bStrictUtf8){
498	499	int i;
499		- char c = z[0];
	500	+ unsigned char c = (unsigned char) z[0];
500	501	char maskNonAscii = bStrictUtf8 ? 0x80 : 0x00;
501	502	if( c=='/' \|\| c==0 ) return 0;
502	503	if( c=='.' ){
503	504	if( z[1]=='/' \|\| z[1]==0 ) return 0;
504	505	if( z[1]=='.' && (z[2]=='/' \|\| z[2]==0) ) return 0;
505	506	}
506		- for(i=0; (c=z[i])!=0; i++){
507		- if( c & maskNonAscii ){
508		- if( (c & 0xf0) == 0xf0 ) {
509		- /* Unicode characters > U+FFFF are not supported.
510		- * Windows XP and earlier cannot handle them.
511		- */
512		- return 0;
513		- }
514		- if( (c & 0xf0) == 0xe0 ) {
515		- /* This is a 3-byte UTF-8 character */
516		- if ( (c & 0xfe) == 0xee ){
517		- /* Range U+E000 - U+FFFF (Starting with 0xee or 0xef in UTF-8 ) */
518		- if ( !(c & 1) \|\| ((z[i+1] & 0xff) < 0xa4) ){
519		- /* Unicode character in the range U+E000 - U+F8FF are for
520		- * private use, they shouldn't occur in filenames. */
521		- return 0;
522		- }
523		- }else if( ((c & 0xff) == 0xed) && ((z[i+1] & 0xe0) == 0xa0) ){
524		- /* Unicode character in the range U+D800 - U+DFFF are for
525		- * surrogate pairs, they shouldn't occur in filenames. */
526		- return 0;
527		- }
528		- }
	507	+ for(i=0; (c=(unsigned char)z[i])!=0; i++){
	508	+ if( c & maskNonAscii ){
	509	+ if( c<0xc2 ){
	510	+ /* Invalid 1-byte UTF-8 sequence, or 2-byte overlong form. */
	511	+ return 0;
	512	+ }else if( (c&0xe0)==0xe0 ){
	513	+ /* 3-byte or more */
	514	+ int unicode;
	515	+ if( c&0x10 ){
	516	+ /* Unicode characters > U+FFFF are not supported.
	517	+ * Windows XP and earlier cannot handle them.
	518	+ */
	519	+ return 0;
	520	+ }
	521	+ /* This is a 3-byte UTF-8 character */
	522	+ unicode = ((c&0x0f)<<12) + ((c&0x3f)<<6) + (c&0x3f);
	523	+ if( unicode <= 0x07ff ){
	524	+ /* overlong form */
	525	+ return 0;
	526	+ }else if( unicode>=0xe000 ){
	527	+ /* U+E000..U+FFFF */
	528	+ if( (unicode<=0xf8ff) \|\| (unicode>=0xfffe) ){
	529	+ /* U+E000..U+F8FF are for private use.
	530	+ * U+FFFE..U+FFFF are noncharacters. */
	531	+ return 0;
	532	+ } else if( (unicode>=0xfdd0) && (unicode<=0xfdef) ){
	533	+ /* U+FDD0..U+FDEF are noncharacters. */
	534	+ return 0;
	535	+ }
	536	+ }else if( (unicode>=0xD800) && (unicode<=0xDFFF) ){
	537	+ /* U+D800..U+DFFF are for surrogate pairs. */
	538	+ return 0;
	539	+ }
	540	+ }
	541	+ do{
	542	+ if( (z[1]&0xc0)!=0x80 ){
	543	+ /* Invalid continuation byte (multi-byte UTF-8) */
	544	+ return 0;
	545	+ }
	546	+ /* The hi-bits of c are used to keep track of the number of expected
	547	+ * continuation-bytes, so we don't need a separate counter. */
	548	+ c<<=1; ++z;
	549	+ }while( c>=0xc0 );
529	550	}else if( c=='\\' ){
530	551	return 0;
531	552	}
532	553	if( c=='/' ){
533	554	if( z[i+1]=='/' ) return 0;
		@@ -578,11 +599,11 @@
578	599	if( z[i]=='\\' ) z[i] = '/';
579	600	}
580	601	#endif
581	602
582	603	/* Removing trailing "/" characters */
583		- if ( !slash ){
	604	+ if( !slash ){
584	605	while( n>1 && z[n-1]=='/' ){ n--; }
585	606	}
586	607
587	608	/* Remove duplicate '/' characters. Except, two // at the beginning
588	609	** of a pathname is allowed since this is important on windows. */
		@@ -835,11 +856,11 @@
835	856	if( zPwd[i]==0 ){
836	857	blob_append(pOut, ".", 1);
837	858	}else{
838	859	blob_append(pOut, "..", 2);
839	860	for(j=i+1; zPwd[j]; j++){
840		- if( zPwd[j]=='/' ) {
	861	+ if( zPwd[j]=='/' ){
841	862	blob_append(pOut, "/..", 3);
842	863	}
843	864	}
844	865	}
845	866	return;
		@@ -852,11 +873,11 @@
852	873	return;
853	874	}
854	875	while( zPath[i-1]!='/' ){ i--; }
855	876	blob_set(&tmp, "../");
856	877	for(j=i; zPwd[j]; j++){
857		- if( zPwd[j]=='/' ) {
	878	+ if( zPwd[j]=='/' ){
858	879	blob_append(&tmp, "../", 3);
859	880	}
860	881	}
861	882	blob_append(&tmp, &zPath[i], -1);
862	883	blob_reset(pOut);
863	884

	--- src/file.c
	+++ src/file.c
	@@ -490,44 +490,65 @@
490	** * Does not contain two or more "/" characters in a row.
491	** * Contains at least one character
492	**
493	** Invalid UTF8 characters result in a false return if bStrictUtf8 is
494	** true. If bStrictUtf8 is false, invalid UTF8 characters are silently
495	** ignored.

496	*/
497	int file_is_simple_pathname(const char *z, int bStrictUtf8){
498	int i;
499	char c = z[0];
500	char maskNonAscii = bStrictUtf8 ? 0x80 : 0x00;
501	if( c=='/' \|\| c==0 ) return 0;
502	if( c=='.' ){
503	if( z[1]=='/' \|\| z[1]==0 ) return 0;
504	if( z[1]=='.' && (z[2]=='/' \|\| z[2]==0) ) return 0;
505	}
506	for(i=0; (c=z[i])!=0; i++){
507	if( c & maskNonAscii ){
508	if( (c & 0xf0) == 0xf0 ) {
509	/* Unicode characters > U+FFFF are not supported.
510	* Windows XP and earlier cannot handle them.
511	*/
512	return 0;
513	}
514	if( (c & 0xf0) == 0xe0 ) {
515	/* This is a 3-byte UTF-8 character */
516	if ( (c & 0xfe) == 0xee ){
517	/* Range U+E000 - U+FFFF (Starting with 0xee or 0xef in UTF-8 ) */
518	if ( !(c & 1) \|\| ((z[i+1] & 0xff) < 0xa4) ){
519	/* Unicode character in the range U+E000 - U+F8FF are for
520	* private use, they shouldn't occur in filenames. */
521	return 0;
522	}
523	}else if( ((c & 0xff) == 0xed) && ((z[i+1] & 0xe0) == 0xa0) ){
524	/* Unicode character in the range U+D800 - U+DFFF are for
525	* surrogate pairs, they shouldn't occur in filenames. */
526	return 0;
527	}
528	}




















529	}else if( c=='\\' ){
530	return 0;
531	}
532	if( c=='/' ){
533	if( z[i+1]=='/' ) return 0;
	@@ -578,11 +599,11 @@
578	if( z[i]=='\\' ) z[i] = '/';
579	}
580	#endif
581
582	/* Removing trailing "/" characters */
583	if ( !slash ){
584	while( n>1 && z[n-1]=='/' ){ n--; }
585	}
586
587	/* Remove duplicate '/' characters. Except, two // at the beginning
588	** of a pathname is allowed since this is important on windows. */
	@@ -835,11 +856,11 @@
835	if( zPwd[i]==0 ){
836	blob_append(pOut, ".", 1);
837	}else{
838	blob_append(pOut, "..", 2);
839	for(j=i+1; zPwd[j]; j++){
840	if( zPwd[j]=='/' ) {
841	blob_append(pOut, "/..", 3);
842	}
843	}
844	}
845	return;
	@@ -852,11 +873,11 @@
852	return;
853	}
854	while( zPath[i-1]!='/' ){ i--; }
855	blob_set(&tmp, "../");
856	for(j=i; zPwd[j]; j++){
857	if( zPwd[j]=='/' ) {
858	blob_append(&tmp, "../", 3);
859	}
860	}
861	blob_append(&tmp, &zPath[i], -1);
862	blob_reset(pOut);
863

	--- src/file.c
	+++ src/file.c
	@@ -490,44 +490,65 @@
490	** * Does not contain two or more "/" characters in a row.
491	** * Contains at least one character
492	**
493	** Invalid UTF8 characters result in a false return if bStrictUtf8 is
494	** true. If bStrictUtf8 is false, invalid UTF8 characters are silently
495	** ignored. See http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
496	** and http://en.wikipedia.org/wiki/Unicode (for the noncharacters)
497	*/
498	int file_is_simple_pathname(const char *z, int bStrictUtf8){
499	int i;
500	unsigned char c = (unsigned char) z[0];
501	char maskNonAscii = bStrictUtf8 ? 0x80 : 0x00;
502	if( c=='/' \|\| c==0 ) return 0;
503	if( c=='.' ){
504	if( z[1]=='/' \|\| z[1]==0 ) return 0;
505	if( z[1]=='.' && (z[2]=='/' \|\| z[2]==0) ) return 0;
506	}
507	for(i=0; (c=(unsigned char)z[i])!=0; i++){
508	if( c & maskNonAscii ){
509	if( c<0xc2 ){
510	/* Invalid 1-byte UTF-8 sequence, or 2-byte overlong form. */
511	return 0;
512	}else if( (c&0xe0)==0xe0 ){
513	/* 3-byte or more */
514	int unicode;
515	if( c&0x10 ){
516	/* Unicode characters > U+FFFF are not supported.
517	* Windows XP and earlier cannot handle them.
518	*/
519	return 0;
520	}
521	/* This is a 3-byte UTF-8 character */
522	unicode = ((c&0x0f)<<12) + ((c&0x3f)<<6) + (c&0x3f);
523	if( unicode <= 0x07ff ){
524	/* overlong form */
525	return 0;
526	}else if( unicode>=0xe000 ){
527	/* U+E000..U+FFFF */
528	if( (unicode<=0xf8ff) \|\| (unicode>=0xfffe) ){
529	/* U+E000..U+F8FF are for private use.
530	* U+FFFE..U+FFFF are noncharacters. */
531	return 0;
532	} else if( (unicode>=0xfdd0) && (unicode<=0xfdef) ){
533	/* U+FDD0..U+FDEF are noncharacters. */
534	return 0;
535	}
536	}else if( (unicode>=0xD800) && (unicode<=0xDFFF) ){
537	/* U+D800..U+DFFF are for surrogate pairs. */
538	return 0;
539	}
540	}
541	do{
542	if( (z[1]&0xc0)!=0x80 ){
543	/* Invalid continuation byte (multi-byte UTF-8) */
544	return 0;
545	}
546	/* The hi-bits of c are used to keep track of the number of expected
547	* continuation-bytes, so we don't need a separate counter. */
548	c<<=1; ++z;
549	}while( c>=0xc0 );
550	}else if( c=='\\' ){
551	return 0;
552	}
553	if( c=='/' ){
554	if( z[i+1]=='/' ) return 0;
	@@ -578,11 +599,11 @@
599	if( z[i]=='\\' ) z[i] = '/';
600	}
601	#endif
602
603	/* Removing trailing "/" characters */
604	if( !slash ){
605	while( n>1 && z[n-1]=='/' ){ n--; }
606	}
607
608	/* Remove duplicate '/' characters. Except, two // at the beginning
609	** of a pathname is allowed since this is important on windows. */
	@@ -835,11 +856,11 @@
856	if( zPwd[i]==0 ){
857	blob_append(pOut, ".", 1);
858	}else{
859	blob_append(pOut, "..", 2);
860	for(j=i+1; zPwd[j]; j++){
861	if( zPwd[j]=='/' ){
862	blob_append(pOut, "/..", 3);
863	}
864	}
865	}
866	return;
	@@ -852,11 +873,11 @@
873	return;
874	}
875	while( zPath[i-1]!='/' ){ i--; }
876	blob_set(&tmp, "../");
877	for(j=i; zPwd[j]; j++){
878	if( zPwd[j]=='/' ){
879	blob_append(&tmp, "../", 3);
880	}
881	}
882	blob_append(&tmp, &zPath[i], -1);
883	blob_reset(pOut);
884

Fossil SCM

Keyboard Shortcuts