Fossil SCM

Generate warning when to-be-committed file contains invalid UTF-8

jan.nijtmans 2012-11-02 10:55 trunk
Commit 4e86b06a9f03db12baffae8509741f5ebd8bcae9
2 files changed: src/checkin.c (+3 -3), src/diff.c (+66 -17)
+3 -3
--- src/checkin.c
+++ src/checkin.c
@@ -895,11 +895,11 @@
895895
static int allOk = 0; /* Set to true to disable this routine */
896896
897897
if( allOk ) return;
898898
fUnicode = starts_with_utf16_bom(p);
899899
eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
900
- if( eType==0 || eType==-1 || fUnicode ){
900
+ if( eType<0 || fUnicode ){
901901
const char *zWarning;
902902
Blob ans;
903903
char cReply;
904904
905905
if( eType==-1 && fUnicode ){
@@ -907,12 +907,12 @@
907907
}else if( eType==-1 ){
908908
if( crnlOk ){
909909
return; /* We don't want CR/NL warnings for this file. */
910910
}
911911
zWarning = "CR/NL line endings";
912
- }else if( eType==0 ){
913
- zWarning = "binary data";
912
+ }else if( eType==-2 ){
913
+ zWarning = "invalid UTF-8 or ASCII";
914914
}else{
915915
zWarning = "Unicode";
916916
}
917917
file_relative_name(zFilename, &fname, 0);
918918
blob_zero(&ans);
919919
--- src/checkin.c
+++ src/checkin.c
@@ -895,11 +895,11 @@
895 static int allOk = 0; /* Set to true to disable this routine */
896
897 if( allOk ) return;
898 fUnicode = starts_with_utf16_bom(p);
899 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
900 if( eType==0 || eType==-1 || fUnicode ){
901 const char *zWarning;
902 Blob ans;
903 char cReply;
904
905 if( eType==-1 && fUnicode ){
@@ -907,12 +907,12 @@
907 }else if( eType==-1 ){
908 if( crnlOk ){
909 return; /* We don't want CR/NL warnings for this file. */
910 }
911 zWarning = "CR/NL line endings";
912 }else if( eType==0 ){
913 zWarning = "binary data";
914 }else{
915 zWarning = "Unicode";
916 }
917 file_relative_name(zFilename, &fname, 0);
918 blob_zero(&ans);
919
--- src/checkin.c
+++ src/checkin.c
@@ -895,11 +895,11 @@
895 static int allOk = 0; /* Set to true to disable this routine */
896
897 if( allOk ) return;
898 fUnicode = starts_with_utf16_bom(p);
899 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
900 if( eType<0 || fUnicode ){
901 const char *zWarning;
902 Blob ans;
903 char cReply;
904
905 if( eType==-1 && fUnicode ){
@@ -907,12 +907,12 @@
907 }else if( eType==-1 ){
908 if( crnlOk ){
909 return; /* We don't want CR/NL warnings for this file. */
910 }
911 zWarning = "CR/NL line endings";
912 }else if( eType==-2 ){
913 zWarning = "invalid UTF-8 or ASCII";
914 }else{
915 zWarning = "Unicode";
916 }
917 file_relative_name(zFilename, &fname, 0);
918 blob_zero(&ans);
919
+66 -17
--- src/diff.c
+++ src/diff.c
@@ -175,47 +175,96 @@
175175
** This function attempts to scan each logical line within the blob to
176176
** determine the type of content it appears to contain. Possible return
177177
** values are:
178178
**
179179
** (1) -- The content appears to consist entirely of text, with lines
180
-** delimited by line-feed characters; however, the encoding may
181
-** not be UTF-8.
180
+** delimited by line-feed characters.
182181
**
183182
** (0) -- The content appears to be binary because it contains embedded
184183
** NUL characters or an extremely long line. Since this function
185184
** does not understand UTF-16, it may falsely consider UTF-16 text
186185
** to be binary.
187186
**
188187
** (-1) -- The content appears to consist entirely of text, with lines
189
-** delimited by carriage-return, line-feed pairs; however, the
190
-** encoding may not be UTF-8.
188
+** delimited by carriage-return, line-feed pairs.
189
+**
190
+** (-2) -- The content appears to consist entirely of text, with lines
191
+** delimited by line-feed characters or carriage-return,
192
+** line-feed pairs; however, the encoding is not UTF-8 or ASCII.
191193
**
192194
*/
195
+
193196
int looks_like_utf8(const Blob *pContent){
194
- const char *z = blob_buffer(pContent);
197
+ unsigned char *z = (unsigned char *) blob_buffer(pContent);
195198
unsigned int n = blob_size(pContent);
196
- int j, c;
199
+ unsigned int j;
200
+ unsigned char c;
197201
int result = 1; /* Assume UTF-8 text with no CR/NL */
198202
199203
/* Check individual lines.
200204
*/
201205
if( n==0 ) return result; /* Empty file -> text */
202206
c = *z;
203
- if( c==0 ) return 0; /* Zero byte in a file -> binary */
207
+ if( c<0x80 ){
208
+ if( c==0 ) return 0; /* Zero byte in a file -> binary */
209
+ }else if( c<0xC0 ){
210
+ result = -2; /* Invalid UTF-8, continue */
211
+ }else if( c<0xE0 ){
212
+ if( n<2 || ((z[1]&0xC0)!=0x80) ){
213
+ result = -2; /* Invalid 2-byte UTF-8, continue */
214
+ }else{
215
+ --n; ++z;
216
+ }
217
+ }else if( c<0xF0 ){
218
+ if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
219
+ result = -2; /* Invalid 3-byte UTF-8, continue */
220
+ }else{
221
+ n-=2; z+=2;
222
+ }
223
+ }else if( c<0xF8 ){
224
+ if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
225
+ result = -2; /* Invalid 4-byte UTF-8, continue */
226
+ }else{
227
+ n-=3; z+=3;
228
+ }
229
+ }else{
230
+ result = -2; /* Invalid multi-byte UTF-8, continue */
231
+ }
204232
j = (c!='\n');
205233
while( --n>0 ){
206234
c = *++z; ++j;
207
- if( c==0 ) return 0; /* Zero byte in a file -> binary */
208
- if( c=='\n' ){
209
- int c2 = z[-1];
210
- if( c2=='\r' ){
211
- result = -1; /* Contains CR/NL, continue */
212
- }
213
- if( j>LENGTH_MASK ){
214
- return 0; /* Very long line -> binary */
215
- }
216
- j = 0;
235
+ if( c<0x80 ){
236
+ if( c==0 ) return 0; /* Zero byte in a file -> binary */
237
+ if( c=='\n' ){
238
+ unsigned char c2 = z[-1];
239
+ if( c2=='\r' && result>0 ){
240
+ result = -1; /* Contains CR/NL, continue */
241
+ }
242
+ if( j>LENGTH_MASK ){
243
+ return 0; /* Very long line -> binary */
244
+ }
245
+ j = 0;
246
+ }
247
+ }else if( c<0xC0 ){
248
+ result = -2; /* Invalid UTF-8, continue */
249
+ }else if( c<0xE0 ){
250
+ if( n<2 || ((z[1]&0xC0)!=0x80) ){
251
+ result = -2; continue; /* Invalid 2-byte UTF-8, continue */
252
+ }
253
+ --n; ++z;
254
+ }else if( c<0xF0 ){
255
+ if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
256
+ result = -2; continue; /* Invalid 3-byte UTF-8, continue */
257
+ }
258
+ n-=2; z+=2;
259
+ }else if( c<0xF8 ){
260
+ if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
261
+ result = -2; continue; /* Invalid 4-byte UTF-8, continue */
262
+ }
263
+ n-=3; z+=3;
264
+ }else{
265
+ result = -2; /* Invalid multi-byte UTF-8, continue */
217266
}
218267
}
219268
if( j>LENGTH_MASK ){
220269
return 0; /* Very long line -> binary */
221270
}
222271
--- src/diff.c
+++ src/diff.c
@@ -175,47 +175,96 @@
175 ** This function attempts to scan each logical line within the blob to
176 ** determine the type of content it appears to contain. Possible return
177 ** values are:
178 **
179 ** (1) -- The content appears to consist entirely of text, with lines
180 ** delimited by line-feed characters; however, the encoding may
181 ** not be UTF-8.
182 **
183 ** (0) -- The content appears to be binary because it contains embedded
184 ** NUL characters or an extremely long line. Since this function
185 ** does not understand UTF-16, it may falsely consider UTF-16 text
186 ** to be binary.
187 **
188 ** (-1) -- The content appears to consist entirely of text, with lines
189 ** delimited by carriage-return, line-feed pairs; however, the
190 ** encoding may not be UTF-8.
 
 
 
191 **
192 */
 
193 int looks_like_utf8(const Blob *pContent){
194 const char *z = blob_buffer(pContent);
195 unsigned int n = blob_size(pContent);
196 int j, c;
 
197 int result = 1; /* Assume UTF-8 text with no CR/NL */
198
199 /* Check individual lines.
200 */
201 if( n==0 ) return result; /* Empty file -> text */
202 c = *z;
203 if( c==0 ) return 0; /* Zero byte in a file -> binary */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204 j = (c!='\n');
205 while( --n>0 ){
206 c = *++z; ++j;
207 if( c==0 ) return 0; /* Zero byte in a file -> binary */
208 if( c=='\n' ){
209 int c2 = z[-1];
210 if( c2=='\r' ){
211 result = -1; /* Contains CR/NL, continue */
212 }
213 if( j>LENGTH_MASK ){
214 return 0; /* Very long line -> binary */
215 }
216 j = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217 }
218 }
219 if( j>LENGTH_MASK ){
220 return 0; /* Very long line -> binary */
221 }
222
--- src/diff.c
+++ src/diff.c
@@ -175,47 +175,96 @@
175 ** This function attempts to scan each logical line within the blob to
176 ** determine the type of content it appears to contain. Possible return
177 ** values are:
178 **
179 ** (1) -- The content appears to consist entirely of text, with lines
180 ** delimited by line-feed characters.
 
181 **
182 ** (0) -- The content appears to be binary because it contains embedded
183 ** NUL characters or an extremely long line. Since this function
184 ** does not understand UTF-16, it may falsely consider UTF-16 text
185 ** to be binary.
186 **
187 ** (-1) -- The content appears to consist entirely of text, with lines
188 ** delimited by carriage-return, line-feed pairs.
189 **
190 ** (-2) -- The content appears to consist entirely of text, with lines
191 ** delimited by line-feed characters or carriage-return,
192 ** line-feed pairs; however, the encoding is not UTF-8 or ASCII.
193 **
194 */
195
196 int looks_like_utf8(const Blob *pContent){
197 unsigned char *z = (unsigned char *) blob_buffer(pContent);
198 unsigned int n = blob_size(pContent);
199 unsigned int j;
200 unsigned char c;
201 int result = 1; /* Assume UTF-8 text with no CR/NL */
202
203 /* Check individual lines.
204 */
205 if( n==0 ) return result; /* Empty file -> text */
206 c = *z;
207 if( c<0x80 ){
208 if( c==0 ) return 0; /* Zero byte in a file -> binary */
209 }else if( c<0xC0 ){
210 result = -2; /* Invalid UTF-8, continue */
211 }else if( c<0xE0 ){
212 if( n<2 || ((z[1]&0xC0)!=0x80) ){
213 result = -2; /* Invalid 2-byte UTF-8, continue */
214 }else{
215 --n; ++z;
216 }
217 }else if( c<0xF0 ){
218 if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
219 result = -2; /* Invalid 3-byte UTF-8, continue */
220 }else{
221 n-=2; z+=2;
222 }
223 }else if( c<0xF8 ){
224 if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
225 result = -2; /* Invalid 4-byte UTF-8, continue */
226 }else{
227 n-=3; z+=3;
228 }
229 }else{
230 result = -2; /* Invalid multi-byte UTF-8, continue */
231 }
232 j = (c!='\n');
233 while( --n>0 ){
234 c = *++z; ++j;
235 if( c<0x80 ){
236 if( c==0 ) return 0; /* Zero byte in a file -> binary */
237 if( c=='\n' ){
238 unsigned char c2 = z[-1];
239 if( c2=='\r' && result>0 ){
240 result = -1; /* Contains CR/NL, continue */
241 }
242 if( j>LENGTH_MASK ){
243 return 0; /* Very long line -> binary */
244 }
245 j = 0;
246 }
247 }else if( c<0xC0 ){
248 result = -2; /* Invalid UTF-8, continue */
249 }else if( c<0xE0 ){
250 if( n<2 || ((z[1]&0xC0)!=0x80) ){
251 result = -2; continue; /* Invalid 2-byte UTF-8, continue */
252 }
253 --n; ++z;
254 }else if( c<0xF0 ){
255 if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){
256 result = -2; continue; /* Invalid 3-byte UTF-8, continue */
257 }
258 n-=2; z+=2;
259 }else if( c<0xF8 ){
260 if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){
261 result = -2; continue; /* Invalid 4-byte UTF-8, continue */
262 }
263 n-=3; z+=3;
264 }else{
265 result = -2; /* Invalid multi-byte UTF-8, continue */
266 }
267 }
268 if( j>LENGTH_MASK ){
269 return 0; /* Very long line -> binary */
270 }
271

Keyboard Shortcuts

Open search: /
Next entry (timeline): j
Previous entry (timeline): k
Open focused entry: Enter
Show this help: ?
Toggle theme: Top nav button