Fossil SCM
Give a warning when a to-be-committed text file contains byte sequences that are not valid UTF-8. The warning can be disabled with the "encoding-glob" setting.
Commit
0cb00c0b8f4e5b03112e5b805dfe21668bfa7374
Parent
85d2a1120e32a41…
1 file changed
+12
-1
+12
-1
| --- src/checkin.c | ||
| +++ src/checkin.c | ||
| @@ -1239,10 +1239,11 @@ | ||
| 1239 | 1239 | int fBinary; /* does the blob content appear to be binary? */ |
| 1240 | 1240 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 1241 | 1241 | int fHasAnyCr; /* the blob contains one or more CR chars */ |
| 1242 | 1242 | int fHasLoneCrOnly; /* all detected line endings are CR only */ |
| 1243 | 1243 | int fHasCrLfOnly; /* all detected line endings are CR/LF pairs */ |
| 1244 | + int fHasInvalidUtf8 = 0;/* contains byte-sequence which is invalid for UTF-8 */ | |
| 1244 | 1245 | char *zMsg; /* Warning message */ |
| 1245 | 1246 | Blob fname; /* Relative pathname of the file */ |
| 1246 | 1247 | static int allOk = 0; /* Set to true to disable this routine */ |
| 1247 | 1248 | |
| 1248 | 1249 | if( allOk ) return 0; |
| @@ -1249,16 +1250,19 @@ | ||
| 1249 | 1250 | fUnicode = could_be_utf16(p, &bReverse); |
| 1250 | 1251 | if( fUnicode ){ |
| 1251 | 1252 | lookFlags = looks_like_utf16(p, bReverse, LOOK_NUL); |
| 1252 | 1253 | }else{ |
| 1253 | 1254 | lookFlags = looks_like_utf8(p, LOOK_NUL); |
| 1255 | + if( !(lookFlags & LOOK_BINARY) && invalid_utf8(p) ){ | |
| 1256 | + fHasInvalidUtf8 = 1; | |
| 1257 | + } | |
| 1254 | 1258 | } |
| 1255 | 1259 | fHasAnyCr = (lookFlags & LOOK_CR); |
| 1256 | 1260 | fBinary = (lookFlags & LOOK_BINARY); |
| 1257 | 1261 | fHasLoneCrOnly = ((lookFlags & LOOK_EOL) == LOOK_LONE_CR); |
| 1258 | 1262 | fHasCrLfOnly = ((lookFlags & LOOK_EOL) == LOOK_CRLF); |
| 1259 | - if( fUnicode || fHasAnyCr || fBinary ){ | |
| 1263 | + if( fUnicode || fHasAnyCr || fBinary || fHasInvalidUtf8){ | |
| 1260 | 1264 | const char *zWarning; |
| 1261 | 1265 | const char *zDisable; |
| 1262 | 1266 | const char *zConvert = "c=convert/"; |
| 1263 | 1267 | Blob ans; |
| 1264 | 1268 | char cReply; |
| @@ -1287,10 +1291,17 @@ | ||
| 1287 | 1291 | zWarning = "CR/NL line endings and Unicode"; |
| 1288 | 1292 | }else{ |
| 1289 | 1293 | zWarning = "mixed line endings and Unicode"; |
| 1290 | 1294 | } |
| 1291 | 1295 | zDisable = "\"crnl-glob\" and \"encoding-glob\" settings"; |
| 1296 | + }else if( fHasInvalidUtf8 ){ | |
| 1297 | + if( encodingOk ){ | |
| 1298 | + return 0; /* We don't want encoding warnings for this file. */ | |
| 1299 | + } | |
| 1300 | + zWarning = "invalid UTF-8"; | |
| 1301 | + zConvert = ""; /* Possible conversion to UTF-8 not yet implemented. */ | |
| 1302 | + zDisable = "\"encoding-glob\" setting"; | |
| 1292 | 1303 | }else if( fHasAnyCr ){ |
| 1293 | 1304 | if( crnlOk ){ |
| 1294 | 1305 | return 0; /* We don't want CR/NL warnings for this file. */ |
| 1295 | 1306 | } |
| 1296 | 1307 | if( fHasLoneCrOnly ){ |
| 1297 | 1308 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -1239,10 +1239,11 @@ | |
| 1239 | int fBinary; /* does the blob content appear to be binary? */ |
| 1240 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 1241 | int fHasAnyCr; /* the blob contains one or more CR chars */ |
| 1242 | int fHasLoneCrOnly; /* all detected line endings are CR only */ |
| 1243 | int fHasCrLfOnly; /* all detected line endings are CR/LF pairs */ |
| 1244 | char *zMsg; /* Warning message */ |
| 1245 | Blob fname; /* Relative pathname of the file */ |
| 1246 | static int allOk = 0; /* Set to true to disable this routine */ |
| 1247 | |
| 1248 | if( allOk ) return 0; |
| @@ -1249,16 +1250,19 @@ | |
| 1249 | fUnicode = could_be_utf16(p, &bReverse); |
| 1250 | if( fUnicode ){ |
| 1251 | lookFlags = looks_like_utf16(p, bReverse, LOOK_NUL); |
| 1252 | }else{ |
| 1253 | lookFlags = looks_like_utf8(p, LOOK_NUL); |
| 1254 | } |
| 1255 | fHasAnyCr = (lookFlags & LOOK_CR); |
| 1256 | fBinary = (lookFlags & LOOK_BINARY); |
| 1257 | fHasLoneCrOnly = ((lookFlags & LOOK_EOL) == LOOK_LONE_CR); |
| 1258 | fHasCrLfOnly = ((lookFlags & LOOK_EOL) == LOOK_CRLF); |
| 1259 | if( fUnicode || fHasAnyCr || fBinary ){ |
| 1260 | const char *zWarning; |
| 1261 | const char *zDisable; |
| 1262 | const char *zConvert = "c=convert/"; |
| 1263 | Blob ans; |
| 1264 | char cReply; |
| @@ -1287,10 +1291,17 @@ | |
| 1287 | zWarning = "CR/NL line endings and Unicode"; |
| 1288 | }else{ |
| 1289 | zWarning = "mixed line endings and Unicode"; |
| 1290 | } |
| 1291 | zDisable = "\"crnl-glob\" and \"encoding-glob\" settings"; |
| 1292 | }else if( fHasAnyCr ){ |
| 1293 | if( crnlOk ){ |
| 1294 | return 0; /* We don't want CR/NL warnings for this file. */ |
| 1295 | } |
| 1296 | if( fHasLoneCrOnly ){ |
| 1297 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -1239,10 +1239,11 @@ | |
| 1239 | int fBinary; /* does the blob content appear to be binary? */ |
| 1240 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 1241 | int fHasAnyCr; /* the blob contains one or more CR chars */ |
| 1242 | int fHasLoneCrOnly; /* all detected line endings are CR only */ |
| 1243 | int fHasCrLfOnly; /* all detected line endings are CR/LF pairs */ |
| 1244 | int fHasInvalidUtf8 = 0;/* contains byte-sequence which is invalid for UTF-8 */ |
| 1245 | char *zMsg; /* Warning message */ |
| 1246 | Blob fname; /* Relative pathname of the file */ |
| 1247 | static int allOk = 0; /* Set to true to disable this routine */ |
| 1248 | |
| 1249 | if( allOk ) return 0; |
| @@ -1249,16 +1250,19 @@ | |
| 1250 | fUnicode = could_be_utf16(p, &bReverse); |
| 1251 | if( fUnicode ){ |
| 1252 | lookFlags = looks_like_utf16(p, bReverse, LOOK_NUL); |
| 1253 | }else{ |
| 1254 | lookFlags = looks_like_utf8(p, LOOK_NUL); |
| 1255 | if( !(lookFlags & LOOK_BINARY) && invalid_utf8(p) ){ |
| 1256 | fHasInvalidUtf8 = 1; |
| 1257 | } |
| 1258 | } |
| 1259 | fHasAnyCr = (lookFlags & LOOK_CR); |
| 1260 | fBinary = (lookFlags & LOOK_BINARY); |
| 1261 | fHasLoneCrOnly = ((lookFlags & LOOK_EOL) == LOOK_LONE_CR); |
| 1262 | fHasCrLfOnly = ((lookFlags & LOOK_EOL) == LOOK_CRLF); |
| 1263 | if( fUnicode || fHasAnyCr || fBinary || fHasInvalidUtf8){ |
| 1264 | const char *zWarning; |
| 1265 | const char *zDisable; |
| 1266 | const char *zConvert = "c=convert/"; |
| 1267 | Blob ans; |
| 1268 | char cReply; |
| @@ -1287,10 +1291,17 @@ | |
| 1291 | zWarning = "CR/NL line endings and Unicode"; |
| 1292 | }else{ |
| 1293 | zWarning = "mixed line endings and Unicode"; |
| 1294 | } |
| 1295 | zDisable = "\"crnl-glob\" and \"encoding-glob\" settings"; |
| 1296 | }else if( fHasInvalidUtf8 ){ |
| 1297 | if( encodingOk ){ |
| 1298 | return 0; /* We don't want encoding warnings for this file. */ |
| 1299 | } |
| 1300 | zWarning = "invalid UTF-8"; |
| 1301 | zConvert = ""; /* Possible conversion to UTF-8 not yet implemented. */ |
| 1302 | zDisable = "\"encoding-glob\" setting"; |
| 1303 | }else if( fHasAnyCr ){ |
| 1304 | if( crnlOk ){ |
| 1305 | return 0; /* We don't want CR/NL warnings for this file. */ |
| 1306 | } |
| 1307 | if( fHasLoneCrOnly ){ |
| 1308 |