Fossil SCM

From the changes.wiki for Fossil 1.25: "Disallow invalid UTF8 characters (such as characters in the surrogate pair range) in filenames." This commit completes the set of UTF8 sequences that are generally considered invalid and should therefore be disallowed in filenames: the "overlong form", invalid continuation bytes, and (finally) noncharacters.

jan.nijtmans 2013-01-21 09:39 UTC trunk
Commit 011d5f692d3dc5535bf1a24567c29b8b4dd45e15
1 file changed +49 -28
+49 -28
--- src/file.c
+++ src/file.c
@@ -490,44 +490,65 @@
490490
** * Does not contain two or more "/" characters in a row.
491491
** * Contains at least one character
492492
**
493493
** Invalid UTF8 characters result in a false return if bStrictUtf8 is
494494
** true. If bStrictUtf8 is false, invalid UTF8 characters are silently
495
-** ignored.
495
+** ignored. See http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
496
+** and http://en.wikipedia.org/wiki/Unicode (for the noncharacters)
496497
*/
497498
int file_is_simple_pathname(const char *z, int bStrictUtf8){
498499
int i;
499
- char c = z[0];
500
+ unsigned char c = (unsigned char) z[0];
500501
char maskNonAscii = bStrictUtf8 ? 0x80 : 0x00;
501502
if( c=='/' || c==0 ) return 0;
502503
if( c=='.' ){
503504
if( z[1]=='/' || z[1]==0 ) return 0;
504505
if( z[1]=='.' && (z[2]=='/' || z[2]==0) ) return 0;
505506
}
506
- for(i=0; (c=z[i])!=0; i++){
507
- if( c & maskNonAscii ){
508
- if( (c & 0xf0) == 0xf0 ) {
509
- /* Unicode characters > U+FFFF are not supported.
510
- * Windows XP and earlier cannot handle them.
511
- */
512
- return 0;
513
- }
514
- if( (c & 0xf0) == 0xe0 ) {
515
- /* This is a 3-byte UTF-8 character */
516
- if ( (c & 0xfe) == 0xee ){
517
- /* Range U+E000 - U+FFFF (Starting with 0xee or 0xef in UTF-8 ) */
518
- if ( !(c & 1) || ((z[i+1] & 0xff) < 0xa4) ){
519
- /* Unicode character in the range U+E000 - U+F8FF are for
520
- * private use, they shouldn't occur in filenames. */
521
- return 0;
522
- }
523
- }else if( ((c & 0xff) == 0xed) && ((z[i+1] & 0xe0) == 0xa0) ){
524
- /* Unicode character in the range U+D800 - U+DFFF are for
525
- * surrogate pairs, they shouldn't occur in filenames. */
526
- return 0;
527
- }
528
- }
507
+ for(i=0; (c=(unsigned char)z[i])!=0; i++){
508
+ if( c & maskNonAscii ){
509
+ if( c<0xc2 ){
510
+ /* Invalid 1-byte UTF-8 sequence, or 2-byte overlong form. */
511
+ return 0;
512
+ }else if( (c&0xe0)==0xe0 ){
513
+ /* 3-byte or more */
514
+ int unicode;
515
+ if( c&0x10 ){
516
+ /* Unicode characters > U+FFFF are not supported.
517
+ * Windows XP and earlier cannot handle them.
518
+ */
519
+ return 0;
520
+ }
521
+ /* This is a 3-byte UTF-8 character */
522
+ unicode = ((c&0x0f)<<12) + ((c&0x3f)<<6) + (c&0x3f);
523
+ if( unicode <= 0x07ff ){
524
+ /* overlong form */
525
+ return 0;
526
+ }else if( unicode>=0xe000 ){
527
+ /* U+E000..U+FFFF */
528
+ if( (unicode<=0xf8ff) || (unicode>=0xfffe) ){
529
+ /* U+E000..U+F8FF are for private use.
530
+ * U+FFFE..U+FFFF are noncharacters. */
531
+ return 0;
532
+ } else if( (unicode>=0xfdd0) && (unicode<=0xfdef) ){
533
+ /* U+FDD0..U+FDEF are noncharacters. */
534
+ return 0;
535
+ }
536
+ }else if( (unicode>=0xD800) && (unicode<=0xDFFF) ){
537
+ /* U+D800..U+DFFF are for surrogate pairs. */
538
+ return 0;
539
+ }
540
+ }
541
+ do{
542
+ if( (z[1]&0xc0)!=0x80 ){
543
+ /* Invalid continuation byte (multi-byte UTF-8) */
544
+ return 0;
545
+ }
546
+ /* The hi-bits of c are used to keep track of the number of expected
547
+ * continuation-bytes, so we don't need a separate counter. */
548
+ c<<=1; ++z;
549
+ }while( c>=0xc0 );
529550
}else if( c=='\\' ){
530551
return 0;
531552
}
532553
if( c=='/' ){
533554
if( z[i+1]=='/' ) return 0;
@@ -578,11 +599,11 @@
578599
if( z[i]=='\\' ) z[i] = '/';
579600
}
580601
#endif
581602
582603
/* Removing trailing "/" characters */
583
- if ( !slash ){
604
+ if( !slash ){
584605
while( n>1 && z[n-1]=='/' ){ n--; }
585606
}
586607
587608
/* Remove duplicate '/' characters. Except, two // at the beginning
588609
** of a pathname is allowed since this is important on windows. */
@@ -835,11 +856,11 @@
835856
if( zPwd[i]==0 ){
836857
blob_append(pOut, ".", 1);
837858
}else{
838859
blob_append(pOut, "..", 2);
839860
for(j=i+1; zPwd[j]; j++){
840
- if( zPwd[j]=='/' ) {
861
+ if( zPwd[j]=='/' ){
841862
blob_append(pOut, "/..", 3);
842863
}
843864
}
844865
}
845866
return;
@@ -852,11 +873,11 @@
852873
return;
853874
}
854875
while( zPath[i-1]!='/' ){ i--; }
855876
blob_set(&tmp, "../");
856877
for(j=i; zPwd[j]; j++){
857
- if( zPwd[j]=='/' ) {
878
+ if( zPwd[j]=='/' ){
858879
blob_append(&tmp, "../", 3);
859880
}
860881
}
861882
blob_append(&tmp, &zPath[i], -1);
862883
blob_reset(pOut);
863884
--- src/file.c
+++ src/file.c
@@ -490,44 +490,65 @@
490 ** * Does not contain two or more "/" characters in a row.
491 ** * Contains at least one character
492 **
493 ** Invalid UTF8 characters result in a false return if bStrictUtf8 is
494 ** true. If bStrictUtf8 is false, invalid UTF8 characters are silently
495 ** ignored.
 
496 */
497 int file_is_simple_pathname(const char *z, int bStrictUtf8){
498 int i;
499 char c = z[0];
500 char maskNonAscii = bStrictUtf8 ? 0x80 : 0x00;
501 if( c=='/' || c==0 ) return 0;
502 if( c=='.' ){
503 if( z[1]=='/' || z[1]==0 ) return 0;
504 if( z[1]=='.' && (z[2]=='/' || z[2]==0) ) return 0;
505 }
506 for(i=0; (c=z[i])!=0; i++){
507 if( c & maskNonAscii ){
508 if( (c & 0xf0) == 0xf0 ) {
509 /* Unicode characters > U+FFFF are not supported.
510 * Windows XP and earlier cannot handle them.
511 */
512 return 0;
513 }
514 if( (c & 0xf0) == 0xe0 ) {
515 /* This is a 3-byte UTF-8 character */
516 if ( (c & 0xfe) == 0xee ){
517 /* Range U+E000 - U+FFFF (Starting with 0xee or 0xef in UTF-8 ) */
518 if ( !(c & 1) || ((z[i+1] & 0xff) < 0xa4) ){
519 /* Unicode character in the range U+E000 - U+F8FF are for
520 * private use, they shouldn't occur in filenames. */
521 return 0;
522 }
523 }else if( ((c & 0xff) == 0xed) && ((z[i+1] & 0xe0) == 0xa0) ){
524 /* Unicode character in the range U+D800 - U+DFFF are for
525 * surrogate pairs, they shouldn't occur in filenames. */
526 return 0;
527 }
528 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
529 }else if( c=='\\' ){
530 return 0;
531 }
532 if( c=='/' ){
533 if( z[i+1]=='/' ) return 0;
@@ -578,11 +599,11 @@
578 if( z[i]=='\\' ) z[i] = '/';
579 }
580 #endif
581
582 /* Removing trailing "/" characters */
583 if ( !slash ){
584 while( n>1 && z[n-1]=='/' ){ n--; }
585 }
586
587 /* Remove duplicate '/' characters. Except, two // at the beginning
588 ** of a pathname is allowed since this is important on windows. */
@@ -835,11 +856,11 @@
835 if( zPwd[i]==0 ){
836 blob_append(pOut, ".", 1);
837 }else{
838 blob_append(pOut, "..", 2);
839 for(j=i+1; zPwd[j]; j++){
840 if( zPwd[j]=='/' ) {
841 blob_append(pOut, "/..", 3);
842 }
843 }
844 }
845 return;
@@ -852,11 +873,11 @@
852 return;
853 }
854 while( zPath[i-1]!='/' ){ i--; }
855 blob_set(&tmp, "../");
856 for(j=i; zPwd[j]; j++){
857 if( zPwd[j]=='/' ) {
858 blob_append(&tmp, "../", 3);
859 }
860 }
861 blob_append(&tmp, &zPath[i], -1);
862 blob_reset(pOut);
863
--- src/file.c
+++ src/file.c
@@ -490,44 +490,65 @@
490 ** * Does not contain two or more "/" characters in a row.
491 ** * Contains at least one character
492 **
493 ** Invalid UTF8 characters result in a false return if bStrictUtf8 is
494 ** true. If bStrictUtf8 is false, invalid UTF8 characters are silently
495 ** ignored. See http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
496 ** and http://en.wikipedia.org/wiki/Unicode (for the noncharacters)
497 */
498 int file_is_simple_pathname(const char *z, int bStrictUtf8){
499 int i;
500 unsigned char c = (unsigned char) z[0];
501 char maskNonAscii = bStrictUtf8 ? 0x80 : 0x00;
502 if( c=='/' || c==0 ) return 0;
503 if( c=='.' ){
504 if( z[1]=='/' || z[1]==0 ) return 0;
505 if( z[1]=='.' && (z[2]=='/' || z[2]==0) ) return 0;
506 }
507 for(i=0; (c=(unsigned char)z[i])!=0; i++){
508 if( c & maskNonAscii ){
509 if( c<0xc2 ){
510 /* Invalid 1-byte UTF-8 sequence, or 2-byte overlong form. */
511 return 0;
512 }else if( (c&0xe0)==0xe0 ){
513 /* 3-byte or more */
514 int unicode;
515 if( c&0x10 ){
516 /* Unicode characters > U+FFFF are not supported.
517 * Windows XP and earlier cannot handle them.
518 */
519 return 0;
520 }
521 /* This is a 3-byte UTF-8 character */
522 unicode = ((c&0x0f)<<12) + ((c&0x3f)<<6) + (c&0x3f);
523 if( unicode <= 0x07ff ){
524 /* overlong form */
525 return 0;
526 }else if( unicode>=0xe000 ){
527 /* U+E000..U+FFFF */
528 if( (unicode<=0xf8ff) || (unicode>=0xfffe) ){
529 /* U+E000..U+F8FF are for private use.
530 * U+FFFE..U+FFFF are noncharacters. */
531 return 0;
532 } else if( (unicode>=0xfdd0) && (unicode<=0xfdef) ){
533 /* U+FDD0..U+FDEF are noncharacters. */
534 return 0;
535 }
536 }else if( (unicode>=0xD800) && (unicode<=0xDFFF) ){
537 /* U+D800..U+DFFF are for surrogate pairs. */
538 return 0;
539 }
540 }
541 do{
542 if( (z[1]&0xc0)!=0x80 ){
543 /* Invalid continuation byte (multi-byte UTF-8) */
544 return 0;
545 }
546 /* The hi-bits of c are used to keep track of the number of expected
547 * continuation-bytes, so we don't need a separate counter. */
548 c<<=1; ++z;
549 }while( c>=0xc0 );
550 }else if( c=='\\' ){
551 return 0;
552 }
553 if( c=='/' ){
554 if( z[i+1]=='/' ) return 0;
@@ -578,11 +599,11 @@
599 if( z[i]=='\\' ) z[i] = '/';
600 }
601 #endif
602
603 /* Removing trailing "/" characters */
604 if( !slash ){
605 while( n>1 && z[n-1]=='/' ){ n--; }
606 }
607
608 /* Remove duplicate '/' characters. Except, two // at the beginning
609 ** of a pathname is allowed since this is important on windows. */
@@ -835,11 +856,11 @@
856 if( zPwd[i]==0 ){
857 blob_append(pOut, ".", 1);
858 }else{
859 blob_append(pOut, "..", 2);
860 for(j=i+1; zPwd[j]; j++){
861 if( zPwd[j]=='/' ){
862 blob_append(pOut, "/..", 3);
863 }
864 }
865 }
866 return;
@@ -852,11 +873,11 @@
873 return;
874 }
875 while( zPath[i-1]!='/' ){ i--; }
876 blob_set(&tmp, "../");
877 for(j=i; zPwd[j]; j++){
878 if( zPwd[j]=='/' ){
879 blob_append(&tmp, "../", 3);
880 }
881 }
882 blob_append(&tmp, &zPath[i], -1);
883 blob_reset(pOut);
884

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button