Fossil SCM
Adjust looks_like_utf16 to handle platforms where wchar_t is unavailable or is not 2 bytes wide.
Commit
7d881d82802ec8cf3f6fc38a35a1ed1fd1423560
Parent
d804902f2333e41…
1 file changed
+43
-11
+43
-11
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -187,10 +187,21 @@ | ||
| 187 | 187 | ** |
| 188 | 188 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 189 | 189 | ** delimited by carriage-return, line-feed pairs; however, the |
| 190 | 190 | ** encoding may not be UTF-8. |
| 191 | 191 | ** |
| 192 | +************************************ WARNING ********************************** | |
| 193 | +** | |
| 194 | +** This function does not validate that the blob content is properly formed | |
| 195 | +** UTF-8. It assumes that all code points are the same size. It does not | |
| 196 | +** validate any code points. It makes no attempt to detect if any [invalid] | |
| 197 | +** switches between UTF-8 and other encodings occur. | |
| 198 | +** | |
| 199 | +** The only code points that this function cares about are the NUL character, | |
| 200 | +** carriage-return, and line-feed. | |
| 201 | +** | |
| 202 | +************************************ WARNING ********************************** | |
| 192 | 203 | */ |
| 193 | 204 | int looks_like_utf8(const Blob *pContent){ |
| 194 | 205 | const char *z = blob_buffer(pContent); |
| 195 | 206 | unsigned int n = blob_size(pContent); |
| 196 | 207 | int j, c; |
| @@ -221,26 +232,36 @@ | ||
| 221 | 232 | } |
| 222 | 233 | return result; /* No problems seen -> not binary */ |
| 223 | 234 | } |
| 224 | 235 | |
| 225 | 236 | /* |
| 226 | -** Maximum length of a line in a text file, in UTF-16 characters. (2731) | |
| 227 | -** The number of bytes represented by this value after conversion to | |
| 228 | -** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK | |
| 237 | +** Define the type needed to represent a Unicode (UTF-16) character. | |
| 238 | +*/ | |
| 239 | +#ifndef WCHAR_T | |
| 240 | +# ifdef _WIN32 | |
| 241 | +# define WCHAR_T wchar_t | |
| 242 | +# else | |
| 243 | +# define WCHAR_T unsigned short | |
| 244 | +# endif | |
| 245 | +#endif | |
| 246 | + | |
| 247 | +/* | |
| 248 | +** Maximum length of a line in a text file, in UTF-16 characters. (4096) | |
| 249 | +** The number of bytes represented by this value cannot exceed LENGTH_MASK | |
| 229 | 250 | ** bytes, because that is the line buffer size used by the diff engine. |
| 230 | 251 | */ |
| 231 | -#define UTF16_LENGTH_MASK (LENGTH_MASK/3) | |
| 252 | +#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char))) | |
| 253 | +#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) | |
| 232 | 254 | |
| 233 | 255 | /* |
| 234 | 256 | ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le |
| 235 | 257 | ** encodings. |
| 236 | 258 | */ |
| 237 | -#define UTF16BE_CR ((wchar_t)'\r') | |
| 238 | -#define UTF16BE_LF ((wchar_t)'\n') | |
| 239 | -#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2)) | |
| 240 | -#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2)) | |
| 241 | -#define UTF16_FFFF ((wchar_t)-1) | |
| 259 | +#define UTF16BE_CR ((WCHAR_T)'\r') | |
| 260 | +#define UTF16BE_LF ((WCHAR_T)'\n') | |
| 261 | +#define UTF16LE_CR (((WCHAR_T)'\r')<<(sizeof(char)<<3)) | |
| 262 | +#define UTF16LE_LF (((WCHAR_T)'\n')<<(sizeof(char)<<3)) | |
| 242 | 263 | |
| 243 | 264 | /* |
| 244 | 265 | ** This function attempts to scan each logical line within the blob to |
| 245 | 266 | ** determine the type of content it appears to contain. Possible return |
| 246 | 267 | ** values are: |
| @@ -256,13 +277,24 @@ | ||
| 256 | 277 | ** |
| 257 | 278 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 258 | 279 | ** delimited by carriage-return, line-feed pairs; however, the |
| 259 | 280 | ** encoding may not be UTF-16. |
| 260 | 281 | ** |
| 282 | +************************************ WARNING ********************************** | |
| 283 | +** | |
| 284 | +** This function does not validate that the blob content is properly formed | |
| 285 | +** UTF-16. It assumes that all code points are the same size. It does not | |
| 286 | +** validate any code points. It makes no attempt to detect if any [invalid] | |
| 287 | +** switches between the UTF-16be and UTF-16le encodings occur. | |
| 288 | +** | |
| 289 | +** The only code points that this function cares about are the NUL character, | |
| 290 | +** carriage-return, and line-feed. | |
| 291 | +** | |
| 292 | +************************************ WARNING ********************************** | |
| 261 | 293 | */ |
| 262 | 294 | int looks_like_utf16(const Blob *pContent){ |
| 263 | - const wchar_t *z = (wchar_t *)blob_buffer(pContent); | |
| 295 | + const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); | |
| 264 | 296 | unsigned int n = blob_size(pContent); |
| 265 | 297 | int j, c; |
| 266 | 298 | int result = 1; /* Assume UTF-16 text with no CR/NL */ |
| 267 | 299 | |
| 268 | 300 | /* Check individual lines. |
| @@ -272,11 +304,11 @@ | ||
| 272 | 304 | c = *z; |
| 273 | 305 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 274 | 306 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 275 | 307 | while( (n-=2)>0 ){ |
| 276 | 308 | c = *++z; ++j; |
| 277 | - if( c==0 || c==UTF16_FFFF ) return 0; /* NUL/FFFF character in a file -> binary */ | |
| 309 | + if( c==0 ) return 0; /* NUL character in a file -> binary */ | |
| 278 | 310 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 279 | 311 | int c2 = z[-1]; |
| 280 | 312 | if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ |
| 281 | 313 | result = -1; /* Contains CR/NL, continue */ |
| 282 | 314 | } |
| 283 | 315 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -187,10 +187,21 @@ | |
| 187 | ** |
| 188 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 189 | ** delimited by carriage-return, line-feed pairs; however, the |
| 190 | ** encoding may not be UTF-8. |
| 191 | ** |
| 192 | */ |
| 193 | int looks_like_utf8(const Blob *pContent){ |
| 194 | const char *z = blob_buffer(pContent); |
| 195 | unsigned int n = blob_size(pContent); |
| 196 | int j, c; |
| @@ -221,26 +232,36 @@ | |
| 221 | } |
| 222 | return result; /* No problems seen -> not binary */ |
| 223 | } |
| 224 | |
| 225 | /* |
| 226 | ** Maximum length of a line in a text file, in UTF-16 characters. (2731) |
| 227 | ** The number of bytes represented by this value after conversion to |
| 228 | ** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK |
| 229 | ** bytes, because that is the line buffer size used by the diff engine. |
| 230 | */ |
| 231 | #define UTF16_LENGTH_MASK (LENGTH_MASK/3) |
| 232 | |
| 233 | /* |
| 234 | ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le |
| 235 | ** encodings. |
| 236 | */ |
| 237 | #define UTF16BE_CR ((wchar_t)'\r') |
| 238 | #define UTF16BE_LF ((wchar_t)'\n') |
| 239 | #define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2)) |
| 240 | #define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2)) |
| 241 | #define UTF16_FFFF ((wchar_t)-1) |
| 242 | |
| 243 | /* |
| 244 | ** This function attempts to scan each logical line within the blob to |
| 245 | ** determine the type of content it appears to contain. Possible return |
| 246 | ** values are: |
| @@ -256,13 +277,24 @@ | |
| 256 | ** |
| 257 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 258 | ** delimited by carriage-return, line-feed pairs; however, the |
| 259 | ** encoding may not be UTF-16. |
| 260 | ** |
| 261 | */ |
| 262 | int looks_like_utf16(const Blob *pContent){ |
| 263 | const wchar_t *z = (wchar_t *)blob_buffer(pContent); |
| 264 | unsigned int n = blob_size(pContent); |
| 265 | int j, c; |
| 266 | int result = 1; /* Assume UTF-16 text with no CR/NL */ |
| 267 | |
| 268 | /* Check individual lines. |
| @@ -272,11 +304,11 @@ | |
| 272 | c = *z; |
| 273 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 274 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 275 | while( (n-=2)>0 ){ |
| 276 | c = *++z; ++j; |
| 277 | if( c==0 || c==UTF16_FFFF ) return 0; /* NUL/FFFF character in a file -> binary */ |
| 278 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 279 | int c2 = z[-1]; |
| 280 | if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ |
| 281 | result = -1; /* Contains CR/NL, continue */ |
| 282 | } |
| 283 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -187,10 +187,21 @@ | |
| 187 | ** |
| 188 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 189 | ** delimited by carriage-return, line-feed pairs; however, the |
| 190 | ** encoding may not be UTF-8. |
| 191 | ** |
| 192 | ************************************ WARNING ********************************** |
| 193 | ** |
| 194 | ** This function does not validate that the blob content is properly formed |
| 195 | ** UTF-8. It assumes that all code points are the same size. It does not |
| 196 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 197 | ** switches between UTF-8 and other encodings occur. |
| 198 | ** |
| 199 | ** The only code points that this function cares about are the NUL character, |
| 200 | ** carriage-return, and line-feed. |
| 201 | ** |
| 202 | ************************************ WARNING ********************************** |
| 203 | */ |
| 204 | int looks_like_utf8(const Blob *pContent){ |
| 205 | const char *z = blob_buffer(pContent); |
| 206 | unsigned int n = blob_size(pContent); |
| 207 | int j, c; |
| @@ -221,26 +232,36 @@ | |
| 232 | } |
| 233 | return result; /* No problems seen -> not binary */ |
| 234 | } |
| 235 | |
| 236 | /* |
| 237 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 238 | */ |
| 239 | #ifndef WCHAR_T |
| 240 | # ifdef _WIN32 |
| 241 | # define WCHAR_T wchar_t |
| 242 | # else |
| 243 | # define WCHAR_T unsigned short |
| 244 | # endif |
| 245 | #endif |
| 246 | |
| 247 | /* |
| 248 | ** Maximum length of a line in a text file, in UTF-16 characters. (4096) |
| 249 | ** The number of bytes represented by this value cannot exceed LENGTH_MASK |
| 250 | ** bytes, because that is the line buffer size used by the diff engine. |
| 251 | */ |
| 252 | #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char))) |
| 253 | #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) |
| 254 | |
| 255 | /* |
| 256 | ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le |
| 257 | ** encodings. |
| 258 | */ |
| 259 | #define UTF16BE_CR ((WCHAR_T)'\r') |
| 260 | #define UTF16BE_LF ((WCHAR_T)'\n') |
| 261 | #define UTF16LE_CR (((WCHAR_T)'\r')<<(sizeof(char)<<3)) |
| 262 | #define UTF16LE_LF (((WCHAR_T)'\n')<<(sizeof(char)<<3)) |
| 263 | |
| 264 | /* |
| 265 | ** This function attempts to scan each logical line within the blob to |
| 266 | ** determine the type of content it appears to contain. Possible return |
| 267 | ** values are: |
| @@ -256,13 +277,24 @@ | |
| 277 | ** |
| 278 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 279 | ** delimited by carriage-return, line-feed pairs; however, the |
| 280 | ** encoding may not be UTF-16. |
| 281 | ** |
| 282 | ************************************ WARNING ********************************** |
| 283 | ** |
| 284 | ** This function does not validate that the blob content is properly formed |
| 285 | ** UTF-16. It assumes that all code points are the same size. It does not |
| 286 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 287 | ** switches between the UTF-16be and UTF-16le encodings occur. |
| 288 | ** |
| 289 | ** The only code points that this function cares about are the NUL character, |
| 290 | ** carriage-return, and line-feed. |
| 291 | ** |
| 292 | ************************************ WARNING ********************************** |
| 293 | */ |
| 294 | int looks_like_utf16(const Blob *pContent){ |
| 295 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 296 | unsigned int n = blob_size(pContent); |
| 297 | int j, c; |
| 298 | int result = 1; /* Assume UTF-16 text with no CR/NL */ |
| 299 | |
| 300 | /* Check individual lines. |
| @@ -272,11 +304,11 @@ | |
| 304 | c = *z; |
| 305 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 306 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 307 | while( (n-=2)>0 ){ |
| 308 | c = *++z; ++j; |
| 309 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 310 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 311 | int c2 = z[-1]; |
| 312 | if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ |
| 313 | result = -1; /* Contains CR/NL, continue */ |
| 314 | } |
| 315 |