Fossil SCM
From the changes.wiki for Fossil 1.25: "Disallow invalid UTF8 characters (such as characters in the surrogate pair range) in filenames." This commit completes the set of UTF-8 byte sequences and code points that are generally considered invalid, and that should therefore be disallowed in filenames: overlong forms, invalid continuation bytes, and, finally, noncharacters.
Commit
011d5f692d3dc5535bf1a24567c29b8b4dd45e15
Parent
1d462a683f082c6…
1 file changed
+49
-28
+49
-28
| --- src/file.c | ||
| +++ src/file.c | ||
| @@ -490,44 +490,65 @@ | ||
| 490 | 490 | ** * Does not contain two or more "/" characters in a row. |
| 491 | 491 | ** * Contains at least one character |
| 492 | 492 | ** |
| 493 | 493 | ** Invalid UTF8 characters result in a false return if bStrictUtf8 is |
| 494 | 494 | ** true. If bStrictUtf8 is false, invalid UTF8 characters are silently |
| 495 | -** ignored. | |
| 495 | +** ignored. See http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences | |
| 496 | +** and http://en.wikipedia.org/wiki/Unicode (for the noncharacters) | |
| 496 | 497 | */ |
| 497 | 498 | int file_is_simple_pathname(const char *z, int bStrictUtf8){ |
| 498 | 499 | int i; |
| 499 | - char c = z[0]; | |
| 500 | + unsigned char c = (unsigned char) z[0]; | |
| 500 | 501 | char maskNonAscii = bStrictUtf8 ? 0x80 : 0x00; |
| 501 | 502 | if( c=='/' || c==0 ) return 0; |
| 502 | 503 | if( c=='.' ){ |
| 503 | 504 | if( z[1]=='/' || z[1]==0 ) return 0; |
| 504 | 505 | if( z[1]=='.' && (z[2]=='/' || z[2]==0) ) return 0; |
| 505 | 506 | } |
| 506 | - for(i=0; (c=z[i])!=0; i++){ | |
| 507 | - if( c & maskNonAscii ){ | |
| 508 | - if( (c & 0xf0) == 0xf0 ) { | |
| 509 | - /* Unicode characters > U+FFFF are not supported. | |
| 510 | - * Windows XP and earlier cannot handle them. | |
| 511 | - */ | |
| 512 | - return 0; | |
| 513 | - } | |
| 514 | - if( (c & 0xf0) == 0xe0 ) { | |
| 515 | - /* This is a 3-byte UTF-8 character */ | |
| 516 | - if ( (c & 0xfe) == 0xee ){ | |
| 517 | - /* Range U+E000 - U+FFFF (Starting with 0xee or 0xef in UTF-8 ) */ | |
| 518 | - if ( !(c & 1) || ((z[i+1] & 0xff) < 0xa4) ){ | |
| 519 | - /* Unicode character in the range U+E000 - U+F8FF are for | |
| 520 | - * private use, they shouldn't occur in filenames. */ | |
| 521 | - return 0; | |
| 522 | - } | |
| 523 | - }else if( ((c & 0xff) == 0xed) && ((z[i+1] & 0xe0) == 0xa0) ){ | |
| 524 | - /* Unicode character in the range U+D800 - U+DFFF are for | |
| 525 | - * surrogate pairs, they shouldn't occur in filenames. */ | |
| 526 | - return 0; | |
| 527 | - } | |
| 528 | - } | |
| 507 | + for(i=0; (c=(unsigned char)z[i])!=0; i++){ | |
| 508 | + if( c & maskNonAscii ){ | |
| 509 | + if( c<0xc2 ){ | |
| 510 | + /* Invalid 1-byte UTF-8 sequence, or 2-byte overlong form. */ | |
| 511 | + return 0; | |
| 512 | + }else if( (c&0xe0)==0xe0 ){ | |
| 513 | + /* 3-byte or more */ | |
| 514 | + int unicode; | |
| 515 | + if( c&0x10 ){ | |
| 516 | + /* Unicode characters > U+FFFF are not supported. | |
| 517 | + * Windows XP and earlier cannot handle them. | |
| 518 | + */ | |
| 519 | + return 0; | |
| 520 | + } | |
| 521 | + /* This is a 3-byte UTF-8 character */ | |
| 522 | + unicode = ((c&0x0f)<<12) + ((c&0x3f)<<6) + (c&0x3f); | |
| 523 | + if( unicode <= 0x07ff ){ | |
| 524 | + /* overlong form */ | |
| 525 | + return 0; | |
| 526 | + }else if( unicode>=0xe000 ){ | |
| 527 | + /* U+E000..U+FFFF */ | |
| 528 | + if( (unicode<=0xf8ff) || (unicode>=0xfffe) ){ | |
| 529 | + /* U+E000..U+F8FF are for private use. | |
| 530 | + * U+FFFE..U+FFFF are noncharacters. */ | |
| 531 | + return 0; | |
| 532 | + } else if( (unicode>=0xfdd0) && (unicode<=0xfdef) ){ | |
| 533 | + /* U+FDD0..U+FDEF are noncharacters. */ | |
| 534 | + return 0; | |
| 535 | + } | |
| 536 | + }else if( (unicode>=0xD800) && (unicode<=0xDFFF) ){ | |
| 537 | + /* U+D800..U+DFFF are for surrogate pairs. */ | |
| 538 | + return 0; | |
| 539 | + } | |
| 540 | + } | |
| 541 | + do{ | |
| 542 | + if( (z[1]&0xc0)!=0x80 ){ | |
| 543 | + /* Invalid continuation byte (multi-byte UTF-8) */ | |
| 544 | + return 0; | |
| 545 | + } | |
| 546 | + /* The hi-bits of c are used to keep track of the number of expected | |
| 547 | + * continuation-bytes, so we don't need a separate counter. */ | |
| 548 | + c<<=1; ++z; | |
| 549 | + }while( c>=0xc0 ); | |
| 529 | 550 | }else if( c=='\\' ){ |
| 530 | 551 | return 0; |
| 531 | 552 | } |
| 532 | 553 | if( c=='/' ){ |
| 533 | 554 | if( z[i+1]=='/' ) return 0; |
| @@ -578,11 +599,11 @@ | ||
| 578 | 599 | if( z[i]=='\\' ) z[i] = '/'; |
| 579 | 600 | } |
| 580 | 601 | #endif |
| 581 | 602 | |
| 582 | 603 | /* Removing trailing "/" characters */ |
| 583 | - if ( !slash ){ | |
| 604 | + if( !slash ){ | |
| 584 | 605 | while( n>1 && z[n-1]=='/' ){ n--; } |
| 585 | 606 | } |
| 586 | 607 | |
| 587 | 608 | /* Remove duplicate '/' characters. Except, two // at the beginning |
| 588 | 609 | ** of a pathname is allowed since this is important on windows. */ |
| @@ -835,11 +856,11 @@ | ||
| 835 | 856 | if( zPwd[i]==0 ){ |
| 836 | 857 | blob_append(pOut, ".", 1); |
| 837 | 858 | }else{ |
| 838 | 859 | blob_append(pOut, "..", 2); |
| 839 | 860 | for(j=i+1; zPwd[j]; j++){ |
| 840 | - if( zPwd[j]=='/' ) { | |
| 861 | + if( zPwd[j]=='/' ){ | |
| 841 | 862 | blob_append(pOut, "/..", 3); |
| 842 | 863 | } |
| 843 | 864 | } |
| 844 | 865 | } |
| 845 | 866 | return; |
| @@ -852,11 +873,11 @@ | ||
| 852 | 873 | return; |
| 853 | 874 | } |
| 854 | 875 | while( zPath[i-1]!='/' ){ i--; } |
| 855 | 876 | blob_set(&tmp, "../"); |
| 856 | 877 | for(j=i; zPwd[j]; j++){ |
| 857 | - if( zPwd[j]=='/' ) { | |
| 878 | + if( zPwd[j]=='/' ){ | |
| 858 | 879 | blob_append(&tmp, "../", 3); |
| 859 | 880 | } |
| 860 | 881 | } |
| 861 | 882 | blob_append(&tmp, &zPath[i], -1); |
| 862 | 883 | blob_reset(pOut); |
| 863 | 884 |
| --- src/file.c | |
| +++ src/file.c | |
| @@ -490,44 +490,65 @@ | |
| 490 | ** * Does not contain two or more "/" characters in a row. |
| 491 | ** * Contains at least one character |
| 492 | ** |
| 493 | ** Invalid UTF8 characters result in a false return if bStrictUtf8 is |
| 494 | ** true. If bStrictUtf8 is false, invalid UTF8 characters are silently |
| 495 | ** ignored. |
| 496 | */ |
| 497 | int file_is_simple_pathname(const char *z, int bStrictUtf8){ |
| 498 | int i; |
| 499 | char c = z[0]; |
| 500 | char maskNonAscii = bStrictUtf8 ? 0x80 : 0x00; |
| 501 | if( c=='/' || c==0 ) return 0; |
| 502 | if( c=='.' ){ |
| 503 | if( z[1]=='/' || z[1]==0 ) return 0; |
| 504 | if( z[1]=='.' && (z[2]=='/' || z[2]==0) ) return 0; |
| 505 | } |
| 506 | for(i=0; (c=z[i])!=0; i++){ |
| 507 | if( c & maskNonAscii ){ |
| 508 | if( (c & 0xf0) == 0xf0 ) { |
| 509 | /* Unicode characters > U+FFFF are not supported. |
| 510 | * Windows XP and earlier cannot handle them. |
| 511 | */ |
| 512 | return 0; |
| 513 | } |
| 514 | if( (c & 0xf0) == 0xe0 ) { |
| 515 | /* This is a 3-byte UTF-8 character */ |
| 516 | if ( (c & 0xfe) == 0xee ){ |
| 517 | /* Range U+E000 - U+FFFF (Starting with 0xee or 0xef in UTF-8 ) */ |
| 518 | if ( !(c & 1) || ((z[i+1] & 0xff) < 0xa4) ){ |
| 519 | /* Unicode character in the range U+E000 - U+F8FF are for |
| 520 | * private use, they shouldn't occur in filenames. */ |
| 521 | return 0; |
| 522 | } |
| 523 | }else if( ((c & 0xff) == 0xed) && ((z[i+1] & 0xe0) == 0xa0) ){ |
| 524 | /* Unicode character in the range U+D800 - U+DFFF are for |
| 525 | * surrogate pairs, they shouldn't occur in filenames. */ |
| 526 | return 0; |
| 527 | } |
| 528 | } |
| 529 | }else if( c=='\\' ){ |
| 530 | return 0; |
| 531 | } |
| 532 | if( c=='/' ){ |
| 533 | if( z[i+1]=='/' ) return 0; |
| @@ -578,11 +599,11 @@ | |
| 578 | if( z[i]=='\\' ) z[i] = '/'; |
| 579 | } |
| 580 | #endif |
| 581 | |
| 582 | /* Removing trailing "/" characters */ |
| 583 | if ( !slash ){ |
| 584 | while( n>1 && z[n-1]=='/' ){ n--; } |
| 585 | } |
| 586 | |
| 587 | /* Remove duplicate '/' characters. Except, two // at the beginning |
| 588 | ** of a pathname is allowed since this is important on windows. */ |
| @@ -835,11 +856,11 @@ | |
| 835 | if( zPwd[i]==0 ){ |
| 836 | blob_append(pOut, ".", 1); |
| 837 | }else{ |
| 838 | blob_append(pOut, "..", 2); |
| 839 | for(j=i+1; zPwd[j]; j++){ |
| 840 | if( zPwd[j]=='/' ) { |
| 841 | blob_append(pOut, "/..", 3); |
| 842 | } |
| 843 | } |
| 844 | } |
| 845 | return; |
| @@ -852,11 +873,11 @@ | |
| 852 | return; |
| 853 | } |
| 854 | while( zPath[i-1]!='/' ){ i--; } |
| 855 | blob_set(&tmp, "../"); |
| 856 | for(j=i; zPwd[j]; j++){ |
| 857 | if( zPwd[j]=='/' ) { |
| 858 | blob_append(&tmp, "../", 3); |
| 859 | } |
| 860 | } |
| 861 | blob_append(&tmp, &zPath[i], -1); |
| 862 | blob_reset(pOut); |
| 863 |
| --- src/file.c | |
| +++ src/file.c | |
| @@ -490,44 +490,65 @@ | |
| 490 | ** * Does not contain two or more "/" characters in a row. |
| 491 | ** * Contains at least one character |
| 492 | ** |
| 493 | ** Invalid UTF8 characters result in a false return if bStrictUtf8 is |
| 494 | ** true. If bStrictUtf8 is false, invalid UTF8 characters are silently |
| 495 | ** ignored. See http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| 496 | ** and http://en.wikipedia.org/wiki/Unicode (for the noncharacters) |
| 497 | */ |
| 498 | int file_is_simple_pathname(const char *z, int bStrictUtf8){ |
| 499 | int i; |
| 500 | unsigned char c = (unsigned char) z[0]; |
| 501 | char maskNonAscii = bStrictUtf8 ? 0x80 : 0x00; |
| 502 | if( c=='/' || c==0 ) return 0; |
| 503 | if( c=='.' ){ |
| 504 | if( z[1]=='/' || z[1]==0 ) return 0; |
| 505 | if( z[1]=='.' && (z[2]=='/' || z[2]==0) ) return 0; |
| 506 | } |
| 507 | for(i=0; (c=(unsigned char)z[i])!=0; i++){ |
| 508 | if( c & maskNonAscii ){ |
| 509 | if( c<0xc2 ){ |
| 510 | /* Invalid 1-byte UTF-8 sequence, or 2-byte overlong form. */ |
| 511 | return 0; |
| 512 | }else if( (c&0xe0)==0xe0 ){ |
| 513 | /* 3-byte or more */ |
| 514 | int unicode; |
| 515 | if( c&0x10 ){ |
| 516 | /* Unicode characters > U+FFFF are not supported. |
| 517 | * Windows XP and earlier cannot handle them. |
| 518 | */ |
| 519 | return 0; |
| 520 | } |
| 521 | /* This is a 3-byte UTF-8 character */ |
| 522 | unicode = ((c&0x0f)<<12) + ((c&0x3f)<<6) + (c&0x3f); |
| 523 | if( unicode <= 0x07ff ){ |
| 524 | /* overlong form */ |
| 525 | return 0; |
| 526 | }else if( unicode>=0xe000 ){ |
| 527 | /* U+E000..U+FFFF */ |
| 528 | if( (unicode<=0xf8ff) || (unicode>=0xfffe) ){ |
| 529 | /* U+E000..U+F8FF are for private use. |
| 530 | * U+FFFE..U+FFFF are noncharacters. */ |
| 531 | return 0; |
| 532 | } else if( (unicode>=0xfdd0) && (unicode<=0xfdef) ){ |
| 533 | /* U+FDD0..U+FDEF are noncharacters. */ |
| 534 | return 0; |
| 535 | } |
| 536 | }else if( (unicode>=0xD800) && (unicode<=0xDFFF) ){ |
| 537 | /* U+D800..U+DFFF are for surrogate pairs. */ |
| 538 | return 0; |
| 539 | } |
| 540 | } |
| 541 | do{ |
| 542 | if( (z[1]&0xc0)!=0x80 ){ |
| 543 | /* Invalid continuation byte (multi-byte UTF-8) */ |
| 544 | return 0; |
| 545 | } |
| 546 | /* The hi-bits of c are used to keep track of the number of expected |
| 547 | * continuation-bytes, so we don't need a separate counter. */ |
| 548 | c<<=1; ++z; |
| 549 | }while( c>=0xc0 ); |
| 550 | }else if( c=='\\' ){ |
| 551 | return 0; |
| 552 | } |
| 553 | if( c=='/' ){ |
| 554 | if( z[i+1]=='/' ) return 0; |
| @@ -578,11 +599,11 @@ | |
| 599 | if( z[i]=='\\' ) z[i] = '/'; |
| 600 | } |
| 601 | #endif |
| 602 | |
| 603 | /* Removing trailing "/" characters */ |
| 604 | if( !slash ){ |
| 605 | while( n>1 && z[n-1]=='/' ){ n--; } |
| 606 | } |
| 607 | |
| 608 | /* Remove duplicate '/' characters. Except, two // at the beginning |
| 609 | ** of a pathname is allowed since this is important on windows. */ |
| @@ -835,11 +856,11 @@ | |
| 856 | if( zPwd[i]==0 ){ |
| 857 | blob_append(pOut, ".", 1); |
| 858 | }else{ |
| 859 | blob_append(pOut, "..", 2); |
| 860 | for(j=i+1; zPwd[j]; j++){ |
| 861 | if( zPwd[j]=='/' ){ |
| 862 | blob_append(pOut, "/..", 3); |
| 863 | } |
| 864 | } |
| 865 | } |
| 866 | return; |
| @@ -852,11 +873,11 @@ | |
| 873 | return; |
| 874 | } |
| 875 | while( zPath[i-1]!='/' ){ i--; } |
| 876 | blob_set(&tmp, "../"); |
| 877 | for(j=i; zPwd[j]; j++){ |
| 878 | if( zPwd[j]=='/' ){ |
| 879 | blob_append(&tmp, "../", 3); |
| 880 | } |
| 881 | } |
| 882 | blob_append(&tmp, &zPath[i], -1); |
| 883 | blob_reset(pOut); |
| 884 |