Fossil SCM
Improve detection of UTF-8, UTF-16, binary data, and carriage returns during commit operations.
Commit
c837e444450dec51a18bed1accb893d8bf35652a
Parent
ef6c243ed929837…
2 files changed
+4
-2
+77
-9
+4
-2
| --- src/checkin.c | ||
| +++ src/checkin.c | ||
| @@ -886,19 +886,19 @@ | ||
| 886 | 886 | ** Issue a warning and give the user an opportunity to abandon out |
| 887 | 887 | ** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending |
| 888 | 888 | ** is seen in a text file. |
| 889 | 889 | */ |
| 890 | 890 | static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){ |
| 891 | - int eType; /* return value of looks_like_text() */ | |
| 891 | + int eType; /* return value of looks_like_utf8/utf16() */ | |
| 892 | 892 | int fUnicode; /* return value of starts_with_utf16_bom() */ |
| 893 | 893 | char *zMsg; /* Warning message */ |
| 894 | 894 | Blob fname; /* Relative pathname of the file */ |
| 895 | 895 | static int allOk = 0; /* Set to true to disable this routine */ |
| 896 | 896 | |
| 897 | 897 | if( allOk ) return; |
| 898 | - eType = looks_like_text(p); | |
| 899 | 898 | fUnicode = starts_with_utf16_bom(p); |
| 899 | + eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); | |
| 900 | 900 | if( eType==-1 || fUnicode ){ |
| 901 | 901 | const char *zWarning; |
| 902 | 902 | Blob ans; |
| 903 | 903 | char cReply; |
| 904 | 904 | |
| @@ -907,10 +907,12 @@ | ||
| 907 | 907 | }else if( eType==-1 ){ |
| 908 | 908 | if( crnlOk ){ |
| 909 | 909 | return; /* We don't want CR/NL warnings for this file. */ |
| 910 | 910 | } |
| 911 | 911 | zWarning = "CR/NL line endings"; |
| 912 | + }else if( eType==0 ){ | |
| 913 | + zWarning = "binary data"; | |
| 912 | 914 | }else{ |
| 913 | 915 | zWarning = "Unicode"; |
| 914 | 916 | } |
| 915 | 917 | file_relative_name(zFilename, &fname, 0); |
| 916 | 918 | blob_zero(&ans); |
| 917 | 919 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -886,19 +886,19 @@ | |
| 886 | ** Issue a warning and give the user an opportunity to abandon out |
| 887 | ** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending |
| 888 | ** is seen in a text file. |
| 889 | */ |
| 890 | static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){ |
| 891 | int eType; /* return value of looks_like_text() */ |
| 892 | int fUnicode; /* return value of starts_with_utf16_bom() */ |
| 893 | char *zMsg; /* Warning message */ |
| 894 | Blob fname; /* Relative pathname of the file */ |
| 895 | static int allOk = 0; /* Set to true to disable this routine */ |
| 896 | |
| 897 | if( allOk ) return; |
| 898 | eType = looks_like_text(p); |
| 899 | fUnicode = starts_with_utf16_bom(p); |
| 900 | if( eType==-1 || fUnicode ){ |
| 901 | const char *zWarning; |
| 902 | Blob ans; |
| 903 | char cReply; |
| 904 | |
| @@ -907,10 +907,12 @@ | |
| 907 | }else if( eType==-1 ){ |
| 908 | if( crnlOk ){ |
| 909 | return; /* We don't want CR/NL warnings for this file. */ |
| 910 | } |
| 911 | zWarning = "CR/NL line endings"; |
| 912 | }else{ |
| 913 | zWarning = "Unicode"; |
| 914 | } |
| 915 | file_relative_name(zFilename, &fname, 0); |
| 916 | blob_zero(&ans); |
| 917 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -886,19 +886,19 @@ | |
| 886 | ** Issue a warning and give the user an opportunity to abandon out |
| 887 | ** if a Unicode (UTF-16) byte-order-mark (BOM) or a \r\n line ending |
| 888 | ** is seen in a text file. |
| 889 | */ |
| 890 | static void commit_warning(const Blob *p, int crnlOk, const char *zFilename){ |
| 891 | int eType; /* return value of looks_like_utf8/utf16() */ |
| 892 | int fUnicode; /* return value of starts_with_utf16_bom() */ |
| 893 | char *zMsg; /* Warning message */ |
| 894 | Blob fname; /* Relative pathname of the file */ |
| 895 | static int allOk = 0; /* Set to true to disable this routine */ |
| 896 | |
| 897 | if( allOk ) return; |
| 898 | fUnicode = starts_with_utf16_bom(p); |
| 899 | eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); |
| 900 | if( eType==-1 || fUnicode ){ |
| 901 | const char *zWarning; |
| 902 | Blob ans; |
| 903 | char cReply; |
| 904 | |
| @@ -907,10 +907,12 @@ | |
| 907 | }else if( eType==-1 ){ |
| 908 | if( crnlOk ){ |
| 909 | return; /* We don't want CR/NL warnings for this file. */ |
| 910 | } |
| 911 | zWarning = "CR/NL line endings"; |
| 912 | }else if( eType==0 ){ |
| 913 | zWarning = "binary data"; |
| 914 | }else{ |
| 915 | zWarning = "Unicode"; |
| 916 | } |
| 917 | file_relative_name(zFilename, &fname, 0); |
| 918 | blob_zero(&ans); |
| 919 |
+77
-9
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -48,15 +48,15 @@ | ||
| 48 | 48 | "cannot compute difference between binary files\n" |
| 49 | 49 | |
| 50 | 50 | #define DIFF_CANNOT_COMPUTE_SYMLINK \ |
| 51 | 51 | "cannot compute difference between symlink and regular file\n" |
| 52 | 52 | |
| 53 | -#define looks_like_binary(blob) (looks_like_text((blob)) == 0) | |
| 53 | +#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0) | |
| 54 | 54 | #endif /* INTERFACE */ |
| 55 | 55 | |
| 56 | 56 | /* |
| 57 | -** Maximum length of a line in a text file. (8192) | |
| 57 | +** Maximum length of a line in a text file, in bytes. (8192) | |
| 58 | 58 | */ |
| 59 | 59 | #define LENGTH_MASK_SZ 13 |
| 60 | 60 | #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) |
| 61 | 61 | |
| 62 | 62 | /* |
| @@ -179,34 +179,34 @@ | ||
| 179 | 179 | ** (1) -- The content appears to consist entirely of text, with lines |
| 180 | 180 | ** delimited by line-feed characters; however, the encoding may |
| 181 | 181 | ** not be UTF-8. |
| 182 | 182 | ** |
| 183 | 183 | ** (0) -- The content appears to be binary because it contains embedded |
| 184 | -** NUL (\000) characters or an extremely long line. Since this | |
| 185 | -** function does not understand UTF-16, it may falsely consider | |
| 186 | -** UTF-16 text to be binary. | |
| 184 | +** NUL characters or an extremely long line. Since this function | |
| 185 | +** does not understand UTF-16, it may falsely consider UTF-16 text | |
| 186 | +** to be binary. | |
| 187 | 187 | ** |
| 188 | 188 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 189 | 189 | ** delimited by carriage-return, line-feed pairs; however, the |
| 190 | 190 | ** encoding may not be UTF-8. |
| 191 | 191 | ** |
| 192 | 192 | */ |
| 193 | -int looks_like_text(const Blob *pContent){ | |
| 193 | +int looks_like_utf8(const Blob *pContent){ | |
| 194 | 194 | const char *z = blob_buffer(pContent); |
| 195 | 195 | unsigned int n = blob_size(pContent); |
| 196 | 196 | int j, c; |
| 197 | - int result = 1; /* Assume text with no CR/NL */ | |
| 197 | + int result = 1; /* Assume UTF-8 text with no CR/NL */ | |
| 198 | 198 | |
| 199 | 199 | /* Check individual lines. |
| 200 | 200 | */ |
| 201 | 201 | if( n==0 ) return result; /* Empty file -> text */ |
| 202 | 202 | c = *z; |
| 203 | - if( c==0 ) return 0; /* \000 byte in a file -> binary */ | |
| 203 | + if( c==0 ) return 0; /* Zero byte in a file -> binary */ | |
| 204 | 204 | j = (c!='\n'); |
| 205 | 205 | while( --n>0 ){ |
| 206 | 206 | c = *++z; ++j; |
| 207 | - if( c==0 ) return 0; /* \000 byte in a file -> binary */ | |
| 207 | + if( c==0 ) return 0; /* Zero byte in a file -> binary */ | |
| 208 | 208 | if( c=='\n' ){ |
| 209 | 209 | if( z[-1]=='\r' ){ |
| 210 | 210 | result = -1; /* Contains CR/NL, continue */ |
| 211 | 211 | } |
| 212 | 212 | if( j>LENGTH_MASK ){ |
| @@ -215,10 +215,78 @@ | ||
| 215 | 215 | j = 0; |
| 216 | 216 | } |
| 217 | 217 | } |
| 218 | 218 | if( j>LENGTH_MASK ){ |
| 219 | 219 | return 0; /* Very long line -> binary */ |
| 220 | + } | |
| 221 | + return result; /* No problems seen -> not binary */ | |
| 222 | +} | |
| 223 | + | |
| 224 | +/* | |
| 225 | +** Maximum length of a line in a text file, in UTF-16 characters. (4096) | |
| 226 | +** The number of bytes represented by this value cannot exceed LENGTH_MASK | |
| 227 | +** bytes, because that is the line buffer size used by the diff engine. | |
| 228 | +*/ | |
| 229 | +#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1) | |
| 230 | +#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) | |
| 231 | + | |
| 232 | +/* | |
| 233 | +** The carriage-return / line-feed characters in the UTF-16be and UTF-16le | |
| 234 | +** encodings. | |
| 235 | +*/ | |
| 236 | +#define UTF16BE_CR ((wchar_t)'\r') | |
| 237 | +#define UTF16BE_LF ((wchar_t)'\n') | |
| 238 | +#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2)) | |
| 239 | +#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2)) | |
| 240 | + | |
| 241 | +/* | |
| 242 | +** This function attempts to scan each logical line within the blob to | |
| 243 | +** determine the type of content it appears to contain. Possible return | |
| 244 | +** values are: | |
| 245 | +** | |
| 246 | +** (1) -- The content appears to consist entirely of text, with lines | |
| 247 | +** delimited by line-feed characters; however, the encoding may | |
| 248 | +** not be UTF-16. | |
| 249 | +** | |
| 250 | +** (0) -- The content appears to be binary because it contains embedded | |
| 251 | +** NUL characters or an extremely long line. Since this function | |
| 252 | +** does not understand UTF-8, it may falsely consider UTF-8 text | |
| 253 | +** to be binary. | |
| 254 | +** | |
| 255 | +** (-1) -- The content appears to consist entirely of text, with lines | |
| 256 | +** delimited by carriage-return, line-feed pairs; however, the | |
| 257 | +** encoding may not be UTF-16. | |
| 258 | +** | |
| 259 | +*/ | |
| 260 | +int looks_like_utf16(const Blob *pContent){ | |
| 261 | + const wchar_t *z = (wchar_t *)blob_buffer(pContent); | |
| 262 | + unsigned int n = blob_size(pContent); | |
| 263 | + int j, c; | |
| 264 | + int result = 1; /* Assume UTF-16 text with no CR/NL */ | |
| 265 | + | |
| 266 | + /* Check individual lines. | |
| 267 | + */ | |
| 268 | + if( n==0 ) return result; /* Empty file -> text */ | |
| 269 | + if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ | |
| 270 | + c = *z; | |
| 271 | + if( c==0 ) return 0; /* NUL character in a file -> binary */ | |
| 272 | + j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); | |
| 273 | + while( (n-=2)>0 ){ | |
| 274 | + c = *++z; ++j; | |
| 275 | + if( c==0 ) return 0; /* NUL character in a file -> binary */ | |
| 276 | + if( c==UTF16BE_LF || c==UTF16LE_LF ){ | |
| 277 | + if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){ | |
| 278 | + result = -1; /* Contains CR/NL, continue */ | |
| 279 | + } | |
| 280 | + if( j>UTF16_LENGTH_MASK ){ | |
| 281 | + return 0; /* Very long line -> binary */ | |
| 282 | + } | |
| 283 | + j = 0; | |
| 284 | + } | |
| 285 | + } | |
| 286 | + if( j>UTF16_LENGTH_MASK ){ | |
| 287 | + return 0; /* Very long line -> binary */ | |
| 220 | 288 | } |
| 221 | 289 | return result; /* No problems seen -> not binary */ |
| 222 | 290 | } |
| 223 | 291 | |
| 224 | 292 | /* |
| 225 | 293 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -48,15 +48,15 @@ | |
| 48 | "cannot compute difference between binary files\n" |
| 49 | |
| 50 | #define DIFF_CANNOT_COMPUTE_SYMLINK \ |
| 51 | "cannot compute difference between symlink and regular file\n" |
| 52 | |
| 53 | #define looks_like_binary(blob) (looks_like_text((blob)) == 0) |
| 54 | #endif /* INTERFACE */ |
| 55 | |
| 56 | /* |
| 57 | ** Maximum length of a line in a text file. (8192) |
| 58 | */ |
| 59 | #define LENGTH_MASK_SZ 13 |
| 60 | #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) |
| 61 | |
| 62 | /* |
| @@ -179,34 +179,34 @@ | |
| 179 | ** (1) -- The content appears to consist entirely of text, with lines |
| 180 | ** delimited by line-feed characters; however, the encoding may |
| 181 | ** not be UTF-8. |
| 182 | ** |
| 183 | ** (0) -- The content appears to be binary because it contains embedded |
| 184 | ** NUL (\000) characters or an extremely long line. Since this |
| 185 | ** function does not understand UTF-16, it may falsely consider |
| 186 | ** UTF-16 text to be binary. |
| 187 | ** |
| 188 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 189 | ** delimited by carriage-return, line-feed pairs; however, the |
| 190 | ** encoding may not be UTF-8. |
| 191 | ** |
| 192 | */ |
| 193 | int looks_like_text(const Blob *pContent){ |
| 194 | const char *z = blob_buffer(pContent); |
| 195 | unsigned int n = blob_size(pContent); |
| 196 | int j, c; |
| 197 | int result = 1; /* Assume text with no CR/NL */ |
| 198 | |
| 199 | /* Check individual lines. |
| 200 | */ |
| 201 | if( n==0 ) return result; /* Empty file -> text */ |
| 202 | c = *z; |
| 203 | if( c==0 ) return 0; /* \000 byte in a file -> binary */ |
| 204 | j = (c!='\n'); |
| 205 | while( --n>0 ){ |
| 206 | c = *++z; ++j; |
| 207 | if( c==0 ) return 0; /* \000 byte in a file -> binary */ |
| 208 | if( c=='\n' ){ |
| 209 | if( z[-1]=='\r' ){ |
| 210 | result = -1; /* Contains CR/NL, continue */ |
| 211 | } |
| 212 | if( j>LENGTH_MASK ){ |
| @@ -215,10 +215,78 @@ | |
| 215 | j = 0; |
| 216 | } |
| 217 | } |
| 218 | if( j>LENGTH_MASK ){ |
| 219 | return 0; /* Very long line -> binary */ |
| 220 | } |
| 221 | return result; /* No problems seen -> not binary */ |
| 222 | } |
| 223 | |
| 224 | /* |
| 225 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -48,15 +48,15 @@ | |
| 48 | "cannot compute difference between binary files\n" |
| 49 | |
| 50 | #define DIFF_CANNOT_COMPUTE_SYMLINK \ |
| 51 | "cannot compute difference between symlink and regular file\n" |
| 52 | |
| 53 | #define looks_like_binary(blob) (looks_like_utf8((blob)) == 0) |
| 54 | #endif /* INTERFACE */ |
| 55 | |
| 56 | /* |
| 57 | ** Maximum length of a line in a text file, in bytes. (8192) |
| 58 | */ |
| 59 | #define LENGTH_MASK_SZ 13 |
| 60 | #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) |
| 61 | |
| 62 | /* |
| @@ -179,34 +179,34 @@ | |
| 179 | ** (1) -- The content appears to consist entirely of text, with lines |
| 180 | ** delimited by line-feed characters; however, the encoding may |
| 181 | ** not be UTF-8. |
| 182 | ** |
| 183 | ** (0) -- The content appears to be binary because it contains embedded |
| 184 | ** NUL characters or an extremely long line. Since this function |
| 185 | ** does not understand UTF-16, it may falsely consider UTF-16 text |
| 186 | ** to be binary. |
| 187 | ** |
| 188 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 189 | ** delimited by carriage-return, line-feed pairs; however, the |
| 190 | ** encoding may not be UTF-8. |
| 191 | ** |
| 192 | */ |
| 193 | int looks_like_utf8(const Blob *pContent){ |
| 194 | const char *z = blob_buffer(pContent); |
| 195 | unsigned int n = blob_size(pContent); |
| 196 | int j, c; |
| 197 | int result = 1; /* Assume UTF-8 text with no CR/NL */ |
| 198 | |
| 199 | /* Check individual lines. |
| 200 | */ |
| 201 | if( n==0 ) return result; /* Empty file -> text */ |
| 202 | c = *z; |
| 203 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 204 | j = (c!='\n'); |
| 205 | while( --n>0 ){ |
| 206 | c = *++z; ++j; |
| 207 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 208 | if( c=='\n' ){ |
| 209 | if( z[-1]=='\r' ){ |
| 210 | result = -1; /* Contains CR/NL, continue */ |
| 211 | } |
| 212 | if( j>LENGTH_MASK ){ |
| @@ -215,10 +215,78 @@ | |
| 215 | j = 0; |
| 216 | } |
| 217 | } |
| 218 | if( j>LENGTH_MASK ){ |
| 219 | return 0; /* Very long line -> binary */ |
| 220 | } |
| 221 | return result; /* No problems seen -> not binary */ |
| 222 | } |
| 223 | |
| 224 | /* |
| 225 | ** Maximum length of a line in a text file, in UTF-16 characters. (4096) |
| 226 | ** The number of bytes represented by this value cannot exceed LENGTH_MASK |
| 227 | ** bytes, because that is the line buffer size used by the diff engine. |
| 228 | */ |
| 229 | #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1) |
| 230 | #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) |
| 231 | |
| 232 | /* |
| 233 | ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le |
| 234 | ** encodings. |
| 235 | */ |
| 236 | #define UTF16BE_CR ((wchar_t)'\r') /* '\r' as a plain wchar_t value (unshifted) */ |
| 237 | #define UTF16BE_LF ((wchar_t)'\n') /* '\n' as a plain wchar_t value (unshifted) */ |
| 238 | #define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2)) /* '\r' shifted left by sizeof(wchar_t)*4 bits, i.e. half of wchar_t's width with 8-bit bytes */ |
| 239 | #define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2)) /* '\n' shifted the same way. NOTE(review): which pair matches BE vs. LE file content presumably depends on host byte order -- confirm */ |
| 240 | |
| 241 | /* |
| 242 | ** This function attempts to scan each logical line within the blob to |
| 243 | ** determine the type of content it appears to contain. Possible return |
| 244 | ** values are: |
| 245 | ** |
| 246 | ** (1) -- The content appears to consist entirely of text, with lines |
| 247 | ** delimited by line-feed characters; however, the encoding may |
| 248 | ** not be UTF-16. |
| 249 | ** |
| 250 | ** (0) -- The content appears to be binary because it contains embedded |
| 251 | ** NUL characters or an extremely long line. Since this function |
| 252 | ** does not understand UTF-8, it may falsely consider UTF-8 text |
| 253 | ** to be binary. |
| 254 | ** |
| 255 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 256 | ** delimited by carriage-return, line-feed pairs; however, the |
| 257 | ** encoding may not be UTF-16. |
| 258 | ** |
| 259 | */ |
| 260 | int looks_like_utf16(const Blob *pContent){ |
| 261 | const wchar_t *z = (wchar_t *)blob_buffer(pContent); /* NOTE(review): z advances one wchar_t per step while n drops by 2 bytes per step; these agree only if sizeof(wchar_t)==2 -- confirm for every target platform */ |
| 262 | unsigned int n = blob_size(pContent); /* Remaining size, counted in bytes */ |
| 263 | int j, c; /* j: characters seen since the last line feed; c: current character */ |
| 264 | int result = 1; /* Assume UTF-16 text with no CR/NL */ |
| 265 | |
| 266 | /* Check individual lines. |
| 267 | */ |
| 268 | if( n==0 ) return result; /* Empty file -> text */ |
| 269 | if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ |
| 270 | c = *z; /* The first character is examined before the loop */ |
| 271 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 272 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); /* Line length starts at 0 if the first character is a line feed, otherwise 1 */ |
| 273 | while( (n-=2)>0 ){ /* n is unsigned and even at this point, so the loop ends exactly when n reaches 0 */ |
| 274 | c = *++z; ++j; |
| 275 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 276 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ /* Line feed in either of the two accepted forms */ |
| 277 | if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){ /* Previous character is a carriage return in either form */ |
| 278 | result = -1; /* Contains CR/NL, continue */ |
| 279 | } |
| 280 | if( j>UTF16_LENGTH_MASK ){ |
| 281 | return 0; /* Very long line -> binary */ |
| 282 | } |
| 283 | j = 0; /* Start counting the next line */ |
| 284 | } |
| 285 | } |
| 286 | if( j>UTF16_LENGTH_MASK ){ /* Final line may lack a terminating line feed; check its length too */ |
| 287 | return 0; /* Very long line -> binary */ |
| 288 | } |
| 289 | return result; /* No problems seen -> not binary */ |
| 290 | } |
| 291 | |
| 292 | /* |
| 293 |