Fossil SCM
Generate warning when to-be-committed file contains invalid UTF-8
Commit: 4e86b06a9f03db12baffae8509741f5ebd8bcae9
Parent: d804902f2333e41…
2 files changed:
  src/checkin.c: +3 -3
  src/diff.c: +66 -17

src/checkin.c: +3 -3
| --- src/checkin.c | ||
| +++ src/checkin.c | ||
| @@ -895,11 +895,11 @@ | ||
| 895 | 895 | static int allOk = 0; /* Set to true to disable this routine */ |
| 896 | 896 | |
| 897 | 897 | if( allOk ) return; |
| 898 | 898 | fUnicode = starts_with_utf16_bom(p); |
| 899 | 899 | eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); |
| 900 | - if( eType==0 || eType==-1 || fUnicode ){ | |
| 900 | + if( eType<0 || fUnicode ){ | |
| 901 | 901 | const char *zWarning; |
| 902 | 902 | Blob ans; |
| 903 | 903 | char cReply; |
| 904 | 904 | |
| 905 | 905 | if( eType==-1 && fUnicode ){ |
| @@ -907,12 +907,12 @@ | ||
| 907 | 907 | }else if( eType==-1 ){ |
| 908 | 908 | if( crnlOk ){ |
| 909 | 909 | return; /* We don't want CR/NL warnings for this file. */ |
| 910 | 910 | } |
| 911 | 911 | zWarning = "CR/NL line endings"; |
| 912 | - }else if( eType==0 ){ | |
| 913 | - zWarning = "binary data"; | |
| 912 | + }else if( eType==-2 ){ | |
| 913 | + zWarning = "invalid UTF-8 or ASCII"; | |
| 914 | 914 | }else{ |
| 915 | 915 | zWarning = "Unicode"; |
| 916 | 916 | } |
| 917 | 917 | file_relative_name(zFilename, &fname, 0); |
| 918 | 918 | blob_zero(&ans); |
| 919 | 919 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -895,11 +895,11 @@ | |
| 895 | static int allOk = 0; /* Set to true to disable this routine */ |
| 896 | |
| 897 | if( allOk ) return; |
| 898 | fUnicode = starts_with_utf16_bom(p); |
| 899 | eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); |
| 900 | if( eType==0 || eType==-1 || fUnicode ){ |
| 901 | const char *zWarning; |
| 902 | Blob ans; |
| 903 | char cReply; |
| 904 | |
| 905 | if( eType==-1 && fUnicode ){ |
| @@ -907,12 +907,12 @@ | |
| 907 | }else if( eType==-1 ){ |
| 908 | if( crnlOk ){ |
| 909 | return; /* We don't want CR/NL warnings for this file. */ |
| 910 | } |
| 911 | zWarning = "CR/NL line endings"; |
| 912 | }else if( eType==0 ){ |
| 913 | zWarning = "binary data"; |
| 914 | }else{ |
| 915 | zWarning = "Unicode"; |
| 916 | } |
| 917 | file_relative_name(zFilename, &fname, 0); |
| 918 | blob_zero(&ans); |
| 919 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -895,11 +895,11 @@ | |
| 895 | static int allOk = 0; /* Set to true to disable this routine */ |
| 896 | |
| 897 | if( allOk ) return; |
| 898 | fUnicode = starts_with_utf16_bom(p); |
| 899 | eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); |
| 900 | if( eType<0 || fUnicode ){ |
| 901 | const char *zWarning; |
| 902 | Blob ans; |
| 903 | char cReply; |
| 904 | |
| 905 | if( eType==-1 && fUnicode ){ |
| @@ -907,12 +907,12 @@ | |
| 907 | }else if( eType==-1 ){ |
| 908 | if( crnlOk ){ |
| 909 | return; /* We don't want CR/NL warnings for this file. */ |
| 910 | } |
| 911 | zWarning = "CR/NL line endings"; |
| 912 | }else if( eType==-2 ){ |
| 913 | zWarning = "invalid UTF-8 or ASCII"; |
| 914 | }else{ |
| 915 | zWarning = "Unicode"; |
| 916 | } |
| 917 | file_relative_name(zFilename, &fname, 0); |
| 918 | blob_zero(&ans); |
| 919 |
src/diff.c: +66 -17
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -175,47 +175,96 @@ | ||
| 175 | 175 | ** This function attempts to scan each logical line within the blob to |
| 176 | 176 | ** determine the type of content it appears to contain. Possible return |
| 177 | 177 | ** values are: |
| 178 | 178 | ** |
| 179 | 179 | ** (1) -- The content appears to consist entirely of text, with lines |
| 180 | -** delimited by line-feed characters; however, the encoding may | |
| 181 | -** not be UTF-8. | |
| 180 | +** delimited by line-feed characters. | |
| 182 | 181 | ** |
| 183 | 182 | ** (0) -- The content appears to be binary because it contains embedded |
| 184 | 183 | ** NUL characters or an extremely long line. Since this function |
| 185 | 184 | ** does not understand UTF-16, it may falsely consider UTF-16 text |
| 186 | 185 | ** to be binary. |
| 187 | 186 | ** |
| 188 | 187 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 189 | -** delimited by carriage-return, line-feed pairs; however, the | |
| 190 | -** encoding may not be UTF-8. | |
| 188 | +** delimited by carriage-return, line-feed pairs. | |
| 189 | +** | |
| 190 | +** (-2) -- The content appears to consist entirely of text, with lines | |
| 191 | +** delimited by line-feed characters or carriage-return, | |
| 192 | +** line-feed pairs; however, the encoding is not UTF-8 or ASCII. | |
| 191 | 193 | ** |
| 192 | 194 | */ |
| 195 | + | |
| 193 | 196 | int looks_like_utf8(const Blob *pContent){ |
| 194 | - const char *z = blob_buffer(pContent); | |
| 197 | + unsigned char *z = (unsigned char *) blob_buffer(pContent); | |
| 195 | 198 | unsigned int n = blob_size(pContent); |
| 196 | - int j, c; | |
| 199 | + unsigned int j; | |
| 200 | + unsigned char c; | |
| 197 | 201 | int result = 1; /* Assume UTF-8 text with no CR/NL */ |
| 198 | 202 | |
| 199 | 203 | /* Check individual lines. |
| 200 | 204 | */ |
| 201 | 205 | if( n==0 ) return result; /* Empty file -> text */ |
| 202 | 206 | c = *z; |
| 203 | - if( c==0 ) return 0; /* Zero byte in a file -> binary */ | |
| 207 | + if( c<0x80 ){ | |
| 208 | + if( c==0 ) return 0; /* Zero byte in a file -> binary */ | |
| 209 | + }else if( c<0xC0 ){ | |
| 210 | + result = -2; /* Invalid UTF-8, continue */ | |
| 211 | + }else if( c<0xE0 ){ | |
| 212 | + if( n<2 || ((z[1]&0xC0)!=0x80) ){ | |
| 213 | + result = -2; /* Invalid 2-byte UTF-8, continue */ | |
| 214 | + }else{ | |
| 215 | + --n; ++z; | |
| 216 | + } | |
| 217 | + }else if( c<0xF0 ){ | |
| 218 | + if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){ | |
| 219 | + result = -2; /* Invalid 3-byte UTF-8, continue */ | |
| 220 | + }else{ | |
| 221 | + n-=2; z+=2; | |
| 222 | + } | |
| 223 | + }else if( c<0xF8 ){ | |
| 224 | + if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){ | |
| 225 | + result = -2; /* Invalid 4-byte UTF-8, continue */ | |
| 226 | + }else{ | |
| 227 | + n-=3; z+=3; | |
| 228 | + } | |
| 229 | + }else{ | |
| 230 | + result = -2; /* Invalid multi-byte UTF-8, continue */ | |
| 231 | + } | |
| 204 | 232 | j = (c!='\n'); |
| 205 | 233 | while( --n>0 ){ |
| 206 | 234 | c = *++z; ++j; |
| 207 | - if( c==0 ) return 0; /* Zero byte in a file -> binary */ | |
| 208 | - if( c=='\n' ){ | |
| 209 | - int c2 = z[-1]; | |
| 210 | - if( c2=='\r' ){ | |
| 211 | - result = -1; /* Contains CR/NL, continue */ | |
| 212 | - } | |
| 213 | - if( j>LENGTH_MASK ){ | |
| 214 | - return 0; /* Very long line -> binary */ | |
| 215 | - } | |
| 216 | - j = 0; | |
| 235 | + if( c<0x80 ){ | |
| 236 | + if( c==0 ) return 0; /* Zero byte in a file -> binary */ | |
| 237 | + if( c=='\n' ){ | |
| 238 | + unsigned char c2 = z[-1]; | |
| 239 | + if( c2=='\r' && result>0 ){ | |
| 240 | + result = -1; /* Contains CR/NL, continue */ | |
| 241 | + } | |
| 242 | + if( j>LENGTH_MASK ){ | |
| 243 | + return 0; /* Very long line -> binary */ | |
| 244 | + } | |
| 245 | + j = 0; | |
| 246 | + } | |
| 247 | + }else if( c<0xC0 ){ | |
| 248 | + result = -2; /* Invalid UTF-8, continue */ | |
| 249 | + }else if( c<0xE0 ){ | |
| 250 | + if( n<2 || ((z[1]&0xC0)!=0x80) ){ | |
| 251 | + result = -2; continue; /* Invalid 2-byte UTF-8, continue */ | |
| 252 | + } | |
| 253 | + --n; ++z; | |
| 254 | + }else if( c<0xF0 ){ | |
| 255 | + if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){ | |
| 256 | + result = -2; continue; /* Invalid 3-byte UTF-8, continue */ | |
| 257 | + } | |
| 258 | + n-=2; z+=2; | |
| 259 | + }else if( c<0xF8 ){ | |
| 260 | + if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){ | |
| 261 | + result = -2; continue; /* Invalid 4-byte UTF-8, continue */ | |
| 262 | + } | |
| 263 | + n-=3; z+=3; | |
| 264 | + }else{ | |
| 265 | + result = -2; /* Invalid multi-byte UTF-8, continue */ | |
| 217 | 266 | } |
| 218 | 267 | } |
| 219 | 268 | if( j>LENGTH_MASK ){ |
| 220 | 269 | return 0; /* Very long line -> binary */ |
| 221 | 270 | } |
| 222 | 271 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -175,47 +175,96 @@ | |
| 175 | ** This function attempts to scan each logical line within the blob to |
| 176 | ** determine the type of content it appears to contain. Possible return |
| 177 | ** values are: |
| 178 | ** |
| 179 | ** (1) -- The content appears to consist entirely of text, with lines |
| 180 | ** delimited by line-feed characters; however, the encoding may |
| 181 | ** not be UTF-8. |
| 182 | ** |
| 183 | ** (0) -- The content appears to be binary because it contains embedded |
| 184 | ** NUL characters or an extremely long line. Since this function |
| 185 | ** does not understand UTF-16, it may falsely consider UTF-16 text |
| 186 | ** to be binary. |
| 187 | ** |
| 188 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 189 | ** delimited by carriage-return, line-feed pairs; however, the |
| 190 | ** encoding may not be UTF-8. |
| 191 | ** |
| 192 | */ |
| 193 | int looks_like_utf8(const Blob *pContent){ |
| 194 | const char *z = blob_buffer(pContent); |
| 195 | unsigned int n = blob_size(pContent); |
| 196 | int j, c; |
| 197 | int result = 1; /* Assume UTF-8 text with no CR/NL */ |
| 198 | |
| 199 | /* Check individual lines. |
| 200 | */ |
| 201 | if( n==0 ) return result; /* Empty file -> text */ |
| 202 | c = *z; |
| 203 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 204 | j = (c!='\n'); |
| 205 | while( --n>0 ){ |
| 206 | c = *++z; ++j; |
| 207 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 208 | if( c=='\n' ){ |
| 209 | int c2 = z[-1]; |
| 210 | if( c2=='\r' ){ |
| 211 | result = -1; /* Contains CR/NL, continue */ |
| 212 | } |
| 213 | if( j>LENGTH_MASK ){ |
| 214 | return 0; /* Very long line -> binary */ |
| 215 | } |
| 216 | j = 0; |
| 217 | } |
| 218 | } |
| 219 | if( j>LENGTH_MASK ){ |
| 220 | return 0; /* Very long line -> binary */ |
| 221 | } |
| 222 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -175,47 +175,96 @@ | |
| 175 | ** This function attempts to scan each logical line within the blob to |
| 176 | ** determine the type of content it appears to contain. Possible return |
| 177 | ** values are: |
| 178 | ** |
| 179 | ** (1) -- The content appears to consist entirely of text, with lines |
| 180 | ** delimited by line-feed characters. |
| 181 | ** |
| 182 | ** (0) -- The content appears to be binary because it contains embedded |
| 183 | ** NUL characters or an extremely long line. Since this function |
| 184 | ** does not understand UTF-16, it may falsely consider UTF-16 text |
| 185 | ** to be binary. |
| 186 | ** |
| 187 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 188 | ** delimited by carriage-return, line-feed pairs. |
| 189 | ** |
| 190 | ** (-2) -- The content appears to consist entirely of text, with lines |
| 191 | ** delimited by line-feed characters or carriage-return, |
| 192 | ** line-feed pairs; however, the encoding is not UTF-8 or ASCII. |
| 193 | ** |
| 194 | */ |
| 195 | |
| 196 | int looks_like_utf8(const Blob *pContent){ |
| 197 | unsigned char *z = (unsigned char *) blob_buffer(pContent); |
| 198 | unsigned int n = blob_size(pContent); |
| 199 | unsigned int j; |
| 200 | unsigned char c; |
| 201 | int result = 1; /* Assume UTF-8 text with no CR/NL */ |
| 202 | |
| 203 | /* Check individual lines. |
| 204 | */ |
| 205 | if( n==0 ) return result; /* Empty file -> text */ |
| 206 | c = *z; |
| 207 | if( c<0x80 ){ |
| 208 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 209 | }else if( c<0xC0 ){ |
| 210 | result = -2; /* Invalid UTF-8, continue */ |
| 211 | }else if( c<0xE0 ){ |
| 212 | if( n<2 || ((z[1]&0xC0)!=0x80) ){ |
| 213 | result = -2; /* Invalid 2-byte UTF-8, continue */ |
| 214 | }else{ |
| 215 | --n; ++z; |
| 216 | } |
| 217 | }else if( c<0xF0 ){ |
| 218 | if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){ |
| 219 | result = -2; /* Invalid 3-byte UTF-8, continue */ |
| 220 | }else{ |
| 221 | n-=2; z+=2; |
| 222 | } |
| 223 | }else if( c<0xF8 ){ |
| 224 | if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){ |
| 225 | result = -2; /* Invalid 4-byte UTF-8, continue */ |
| 226 | }else{ |
| 227 | n-=3; z+=3; |
| 228 | } |
| 229 | }else{ |
| 230 | result = -2; /* Invalid multi-byte UTF-8, continue */ |
| 231 | } |
| 232 | j = (c!='\n'); |
| 233 | while( --n>0 ){ |
| 234 | c = *++z; ++j; |
| 235 | if( c<0x80 ){ |
| 236 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 237 | if( c=='\n' ){ |
| 238 | unsigned char c2 = z[-1]; |
| 239 | if( c2=='\r' && result>0 ){ |
| 240 | result = -1; /* Contains CR/NL, continue */ |
| 241 | } |
| 242 | if( j>LENGTH_MASK ){ |
| 243 | return 0; /* Very long line -> binary */ |
| 244 | } |
| 245 | j = 0; |
| 246 | } |
| 247 | }else if( c<0xC0 ){ |
| 248 | result = -2; /* Invalid UTF-8, continue */ |
| 249 | }else if( c<0xE0 ){ |
| 250 | if( n<2 || ((z[1]&0xC0)!=0x80) ){ |
| 251 | result = -2; continue; /* Invalid 2-byte UTF-8, continue */ |
| 252 | } |
| 253 | --n; ++z; |
| 254 | }else if( c<0xF0 ){ |
| 255 | if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){ |
| 256 | result = -2; continue; /* Invalid 3-byte UTF-8, continue */ |
| 257 | } |
| 258 | n-=2; z+=2; |
| 259 | }else if( c<0xF8 ){ |
| 260 | if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){ |
| 261 | result = -2; continue; /* Invalid 4-byte UTF-8, continue */ |
| 262 | } |
| 263 | n-=3; z+=3; |
| 264 | }else{ |
| 265 | result = -2; /* Invalid multi-byte UTF-8, continue */ |
| 266 | } |
| 267 | } |
| 268 | if( j>LENGTH_MASK ){ |
| 269 | return 0; /* Very long line -> binary */ |
| 270 | } |
| 271 |