Fossil SCM

Adjustments to looks_like_utf16 to handle wchar_t being missing or not 2 bytes.

mistachkin 2012-11-02 17:22 trunk
Commit 7d881d82802ec8cf3f6fc38a35a1ed1fd1423560
1 file changed, +43 -11
--- src/diff.c
+++ src/diff.c
@@ -187,10 +187,21 @@
187187
**
188188
** (-1) -- The content appears to consist entirely of text, with lines
189189
** delimited by carriage-return, line-feed pairs; however, the
190190
** encoding may not be UTF-8.
191191
**
192
+************************************ WARNING **********************************
193
+**
194
+** This function does not validate that the blob content is properly formed
195
+** UTF-8. It assumes that all code points are the same size. It does not
196
+** validate any code points. It makes no attempt to detect if any [invalid]
197
+** switches between UTF-8 and other encodings occur.
198
+**
199
+** The only code points that this function cares about are the NUL character,
200
+** carriage-return, and line-feed.
201
+**
202
+************************************ WARNING **********************************
192203
*/
193204
int looks_like_utf8(const Blob *pContent){
194205
const char *z = blob_buffer(pContent);
195206
unsigned int n = blob_size(pContent);
196207
int j, c;
@@ -221,26 +232,36 @@
221232
}
222233
return result; /* No problems seen -> not binary */
223234
}
224235
225236
/*
226
-** Maximum length of a line in a text file, in UTF-16 characters. (2731)
227
-** The number of bytes represented by this value after conversion to
228
-** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK
237
+** Define the type needed to represent a Unicode (UTF-16) character.
238
+*/
239
+#ifndef WCHAR_T
240
+# ifdef _WIN32
241
+# define WCHAR_T wchar_t
242
+# else
243
+# define WCHAR_T unsigned short
244
+# endif
245
+#endif
246
+
247
+/*
248
+** Maximum length of a line in a text file, in UTF-16 characters. (4096)
249
+** The number of bytes represented by this value cannot exceed LENGTH_MASK
229250
** bytes, because that is the line buffer size used by the diff engine.
230251
*/
231
-#define UTF16_LENGTH_MASK (LENGTH_MASK/3)
252
+#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
253
+#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
232254
233255
/*
234256
** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
235257
** encodings.
236258
*/
237
-#define UTF16BE_CR ((wchar_t)'\r')
238
-#define UTF16BE_LF ((wchar_t)'\n')
239
-#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
240
-#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
241
-#define UTF16_FFFF ((wchar_t)-1)
259
+#define UTF16BE_CR ((WCHAR_T)'\r')
260
+#define UTF16BE_LF ((WCHAR_T)'\n')
261
+#define UTF16LE_CR (((WCHAR_T)'\r')<<(sizeof(char)<<3))
262
+#define UTF16LE_LF (((WCHAR_T)'\n')<<(sizeof(char)<<3))
242263
243264
/*
244265
** This function attempts to scan each logical line within the blob to
245266
** determine the type of content it appears to contain. Possible return
246267
** values are:
@@ -256,13 +277,24 @@
256277
**
257278
** (-1) -- The content appears to consist entirely of text, with lines
258279
** delimited by carriage-return, line-feed pairs; however, the
259280
** encoding may not be UTF-16.
260281
**
282
+************************************ WARNING **********************************
283
+**
284
+** This function does not validate that the blob content is properly formed
285
+** UTF-16. It assumes that all code points are the same size. It does not
286
+** validate any code points. It makes no attempt to detect if any [invalid]
287
+** switches between the UTF-16be and UTF-16le encodings occur.
288
+**
289
+** The only code points that this function cares about are the NUL character,
290
+** carriage-return, and line-feed.
291
+**
292
+************************************ WARNING **********************************
261293
*/
262294
int looks_like_utf16(const Blob *pContent){
263
- const wchar_t *z = (wchar_t *)blob_buffer(pContent);
295
+ const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
264296
unsigned int n = blob_size(pContent);
265297
int j, c;
266298
int result = 1; /* Assume UTF-16 text with no CR/NL */
267299
268300
/* Check individual lines.
@@ -272,11 +304,11 @@
272304
c = *z;
273305
if( c==0 ) return 0; /* NUL character in a file -> binary */
274306
j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
275307
while( (n-=2)>0 ){
276308
c = *++z; ++j;
277
- if( c==0 || c==UTF16_FFFF ) return 0; /* NUL/FFFF character in a file -> binary */
309
+ if( c==0 ) return 0; /* NUL character in a file -> binary */
278310
if( c==UTF16BE_LF || c==UTF16LE_LF ){
279311
int c2 = z[-1];
280312
if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
281313
result = -1; /* Contains CR/NL, continue */
282314
}
283315
--- src/diff.c
+++ src/diff.c
@@ -187,10 +187,21 @@
187 **
188 ** (-1) -- The content appears to consist entirely of text, with lines
189 ** delimited by carriage-return, line-feed pairs; however, the
190 ** encoding may not be UTF-8.
191 **
 
 
 
 
 
 
 
 
 
 
 
192 */
193 int looks_like_utf8(const Blob *pContent){
194 const char *z = blob_buffer(pContent);
195 unsigned int n = blob_size(pContent);
196 int j, c;
@@ -221,26 +232,36 @@
221 }
222 return result; /* No problems seen -> not binary */
223 }
224
225 /*
226 ** Maximum length of a line in a text file, in UTF-16 characters. (2731)
227 ** The number of bytes represented by this value after conversion to
228 ** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK
 
 
 
 
 
 
 
 
 
 
229 ** bytes, because that is the line buffer size used by the diff engine.
230 */
231 #define UTF16_LENGTH_MASK (LENGTH_MASK/3)
 
232
233 /*
234 ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
235 ** encodings.
236 */
237 #define UTF16BE_CR ((wchar_t)'\r')
238 #define UTF16BE_LF ((wchar_t)'\n')
239 #define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
240 #define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
241 #define UTF16_FFFF ((wchar_t)-1)
242
243 /*
244 ** This function attempts to scan each logical line within the blob to
245 ** determine the type of content it appears to contain. Possible return
246 ** values are:
@@ -256,13 +277,24 @@
256 **
257 ** (-1) -- The content appears to consist entirely of text, with lines
258 ** delimited by carriage-return, line-feed pairs; however, the
259 ** encoding may not be UTF-16.
260 **
 
 
 
 
 
 
 
 
 
 
 
261 */
262 int looks_like_utf16(const Blob *pContent){
263 const wchar_t *z = (wchar_t *)blob_buffer(pContent);
264 unsigned int n = blob_size(pContent);
265 int j, c;
266 int result = 1; /* Assume UTF-16 text with no CR/NL */
267
268 /* Check individual lines.
@@ -272,11 +304,11 @@
272 c = *z;
273 if( c==0 ) return 0; /* NUL character in a file -> binary */
274 j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
275 while( (n-=2)>0 ){
276 c = *++z; ++j;
277 if( c==0 || c==UTF16_FFFF ) return 0; /* NUL/FFFF character in a file -> binary */
278 if( c==UTF16BE_LF || c==UTF16LE_LF ){
279 int c2 = z[-1];
280 if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
281 result = -1; /* Contains CR/NL, continue */
282 }
283
--- src/diff.c
+++ src/diff.c
@@ -187,10 +187,21 @@
187 **
188 ** (-1) -- The content appears to consist entirely of text, with lines
189 ** delimited by carriage-return, line-feed pairs; however, the
190 ** encoding may not be UTF-8.
191 **
192 ************************************ WARNING **********************************
193 **
194 ** This function does not validate that the blob content is properly formed
195 ** UTF-8. It assumes that all code points are the same size. It does not
196 ** validate any code points. It makes no attempt to detect if any [invalid]
197 ** switches between UTF-8 and other encodings occur.
198 **
199 ** The only code points that this function cares about are the NUL character,
200 ** carriage-return, and line-feed.
201 **
202 ************************************ WARNING **********************************
203 */
204 int looks_like_utf8(const Blob *pContent){
205 const char *z = blob_buffer(pContent);
206 unsigned int n = blob_size(pContent);
207 int j, c;
@@ -221,26 +232,36 @@
232 }
233 return result; /* No problems seen -> not binary */
234 }
235
236 /*
237 ** Define the type needed to represent a Unicode (UTF-16) character.
238 */
239 #ifndef WCHAR_T
240 # ifdef _WIN32
241 # define WCHAR_T wchar_t
242 # else
243 # define WCHAR_T unsigned short
244 # endif
245 #endif
246
247 /*
248 ** Maximum length of a line in a text file, in UTF-16 characters. (4096)
249 ** The number of bytes represented by this value cannot exceed LENGTH_MASK
250 ** bytes, because that is the line buffer size used by the diff engine.
251 */
252 #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
253 #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
254
255 /*
256 ** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
257 ** encodings.
258 */
259 #define UTF16BE_CR ((WCHAR_T)'\r')
260 #define UTF16BE_LF ((WCHAR_T)'\n')
261 #define UTF16LE_CR (((WCHAR_T)'\r')<<(sizeof(char)<<3))
262 #define UTF16LE_LF (((WCHAR_T)'\n')<<(sizeof(char)<<3))
 
263
264 /*
265 ** This function attempts to scan each logical line within the blob to
266 ** determine the type of content it appears to contain. Possible return
267 ** values are:
@@ -256,13 +277,24 @@
277 **
278 ** (-1) -- The content appears to consist entirely of text, with lines
279 ** delimited by carriage-return, line-feed pairs; however, the
280 ** encoding may not be UTF-16.
281 **
282 ************************************ WARNING **********************************
283 **
284 ** This function does not validate that the blob content is properly formed
285 ** UTF-16. It assumes that all code points are the same size. It does not
286 ** validate any code points. It makes no attempt to detect if any [invalid]
287 ** switches between the UTF-16be and UTF-16le encodings occur.
288 **
289 ** The only code points that this function cares about are the NUL character,
290 ** carriage-return, and line-feed.
291 **
292 ************************************ WARNING **********************************
293 */
294 int looks_like_utf16(const Blob *pContent){
295 const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
296 unsigned int n = blob_size(pContent);
297 int j, c;
298 int result = 1; /* Assume UTF-16 text with no CR/NL */
299
300 /* Check individual lines.
@@ -272,11 +304,11 @@
304 c = *z;
305 if( c==0 ) return 0; /* NUL character in a file -> binary */
306 j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
307 while( (n-=2)>0 ){
308 c = *++z; ++j;
309 if( c==0 ) return 0; /* NUL character in a file -> binary */
310 if( c==UTF16BE_LF || c==UTF16LE_LF ){
311 int c2 = z[-1];
312 if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
313 result = -1; /* Contains CR/NL, continue */
314 }
315

Keyboard Shortcuts

Open search: /
Next entry (timeline): j
Previous entry (timeline): k
Open focused entry: Enter
Show this help: ?
Toggle theme: Top nav button