Fossil SCM

Added the -v|--verbose flag to the test-looks-like-utf8 command so that one can find out why a command like "fossil ci" treats an input known to be text as "binary", and then fix the problem.

wyoung 2025-08-01 04:50 trunk
Commit cf6c15bb285ef2cc23e580faa0c70b2643a559549a073aca732a58df4ce009bb
+1 -1
--- src/checkin.c
+++ src/checkin.c
@@ -2096,11 +2096,11 @@
20962096
if( sizeOk ){
20972097
fUnicode = could_be_utf16(pContent, &bReverse);
20982098
if( fUnicode ){
20992099
lookFlags = looks_like_utf16(pContent, bReverse, LOOK_NUL);
21002100
}else{
2101
- lookFlags = looks_like_utf8(pContent, LOOK_NUL);
2101
+ lookFlags = looks_like_utf8(pContent, LOOK_NUL, 0);
21022102
if( !(lookFlags & LOOK_BINARY) && invalid_utf8(pContent) ){
21032103
fHasInvalidUtf8 = 1;
21042104
}
21052105
}
21062106
fHasAnyCr = (lookFlags & LOOK_CR);
21072107
--- src/checkin.c
+++ src/checkin.c
@@ -2096,11 +2096,11 @@
2096 if( sizeOk ){
2097 fUnicode = could_be_utf16(pContent, &bReverse);
2098 if( fUnicode ){
2099 lookFlags = looks_like_utf16(pContent, bReverse, LOOK_NUL);
2100 }else{
2101 lookFlags = looks_like_utf8(pContent, LOOK_NUL);
2102 if( !(lookFlags & LOOK_BINARY) && invalid_utf8(pContent) ){
2103 fHasInvalidUtf8 = 1;
2104 }
2105 }
2106 fHasAnyCr = (lookFlags & LOOK_CR);
2107
--- src/checkin.c
+++ src/checkin.c
@@ -2096,11 +2096,11 @@
2096 if( sizeOk ){
2097 fUnicode = could_be_utf16(pContent, &bReverse);
2098 if( fUnicode ){
2099 lookFlags = looks_like_utf16(pContent, bReverse, LOOK_NUL);
2100 }else{
2101 lookFlags = looks_like_utf8(pContent, LOOK_NUL, 0);
2102 if( !(lookFlags & LOOK_BINARY) && invalid_utf8(pContent) ){
2103 fHasInvalidUtf8 = 1;
2104 }
2105 }
2106 fHasAnyCr = (lookFlags & LOOK_CR);
2107
+2 -2
--- src/fileedit.c
+++ src/fileedit.c
@@ -577,19 +577,19 @@
577577
**
578578
** https://html.spec.whatwg.org/#the-textarea-element
579579
*/
580580
const int pseudoBinary = LOOK_LONG | LOOK_NUL;
581581
const int lookFlags = LOOK_CRLF | LOOK_LONE_LF | pseudoBinary;
582
- const int lookNew = looks_like_utf8( &pCI->fileContent, lookFlags );
582
+ const int lookNew = looks_like_utf8( &pCI->fileContent, lookFlags, 0 );
583583
if(!(pseudoBinary & lookNew)){
584584
int rehash = 0;
585585
/*fossil_print("lookNew=%08x\n",lookNew);*/
586586
if(CIMINI_CONVERT_EOL_INHERIT & pCI->flags){
587587
Blob contentPrev = empty_blob;
588588
int lookOrig, nOrig;
589589
content_get(prevFRid, &contentPrev);
590
- lookOrig = looks_like_utf8(&contentPrev, lookFlags);
590
+ lookOrig = looks_like_utf8(&contentPrev, lookFlags, 0);
591591
nOrig = blob_size(&contentPrev);
592592
blob_reset(&contentPrev);
593593
/*fossil_print("lookOrig=%08x\n",lookOrig);*/
594594
if(nOrig>0 && lookOrig!=lookNew){
595595
/* If there is a newline-style mismatch, adjust the new
596596
--- src/fileedit.c
+++ src/fileedit.c
@@ -577,19 +577,19 @@
577 **
578 ** https://html.spec.whatwg.org/#the-textarea-element
579 */
580 const int pseudoBinary = LOOK_LONG | LOOK_NUL;
581 const int lookFlags = LOOK_CRLF | LOOK_LONE_LF | pseudoBinary;
582 const int lookNew = looks_like_utf8( &pCI->fileContent, lookFlags );
583 if(!(pseudoBinary & lookNew)){
584 int rehash = 0;
585 /*fossil_print("lookNew=%08x\n",lookNew);*/
586 if(CIMINI_CONVERT_EOL_INHERIT & pCI->flags){
587 Blob contentPrev = empty_blob;
588 int lookOrig, nOrig;
589 content_get(prevFRid, &contentPrev);
590 lookOrig = looks_like_utf8(&contentPrev, lookFlags);
591 nOrig = blob_size(&contentPrev);
592 blob_reset(&contentPrev);
593 /*fossil_print("lookOrig=%08x\n",lookOrig);*/
594 if(nOrig>0 && lookOrig!=lookNew){
595 /* If there is a newline-style mismatch, adjust the new
596
--- src/fileedit.c
+++ src/fileedit.c
@@ -577,19 +577,19 @@
577 **
578 ** https://html.spec.whatwg.org/#the-textarea-element
579 */
580 const int pseudoBinary = LOOK_LONG | LOOK_NUL;
581 const int lookFlags = LOOK_CRLF | LOOK_LONE_LF | pseudoBinary;
582 const int lookNew = looks_like_utf8( &pCI->fileContent, lookFlags, 0 );
583 if(!(pseudoBinary & lookNew)){
584 int rehash = 0;
585 /*fossil_print("lookNew=%08x\n",lookNew);*/
586 if(CIMINI_CONVERT_EOL_INHERIT & pCI->flags){
587 Blob contentPrev = empty_blob;
588 int lookOrig, nOrig;
589 content_get(prevFRid, &contentPrev);
590 lookOrig = looks_like_utf8(&contentPrev, lookFlags, 0);
591 nOrig = blob_size(&contentPrev);
592 blob_reset(&contentPrev);
593 /*fossil_print("lookOrig=%08x\n",lookOrig);*/
594 if(nOrig>0 && lookOrig!=lookNew){
595 /* If there is a newline-style mismatch, adjust the new
596
+25 -3
--- src/lookslike.c
+++ src/lookslike.c
@@ -29,11 +29,11 @@
2929
/*
3030
** This macro is designed to return non-zero if the specified blob contains
3131
** data that MAY be binary in nature; otherwise, zero will be returned.
3232
*/
3333
#define looks_like_binary(blob) \
34
- ((looks_like_utf8((blob), LOOK_BINARY) & LOOK_BINARY) != LOOK_NONE)
34
+ ((looks_like_utf8((blob), LOOK_BINARY, 0) & LOOK_BINARY) != LOOK_NONE)
3535
3636
/*
3737
** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
3838
** to convey status information about the blob content.
3939
*/
@@ -114,46 +114,66 @@
114114
** This function examines the contents of the blob until one of the flags
115115
** specified in "stopFlags" is set.
116116
**
117117
************************************ WARNING **********************************
118118
*/
119
-int looks_like_utf8(const Blob *pContent, int stopFlags){
119
+int looks_like_utf8(const Blob *pContent, int stopFlags, int fVerbose){
120120
const char *z = blob_buffer(pContent);
121121
unsigned int n = blob_size(pContent);
122122
int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */
123
+ int nLine = 1;
123124
124125
if( n==0 ) return flags; /* Empty file -> text */
125126
c = *z;
126127
if( c==0 ){
127128
flags |= LOOK_NUL; /* NUL character in a file -> binary */
129
+ if( fVerbose ) fossil_print("NUL at start\n");
128130
}else if( c=='\r' ){
129131
flags |= LOOK_CR;
132
+ if( fVerbose ) fossil_print("CR at start\n");
130133
if( n<=1 || z[1]!='\n' ){
131134
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
135
+ if( fVerbose ) fossil_print("Lone CR at start\n");
132136
}
133137
}
134138
j = (c!='\n');
135139
if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
136140
while( !(flags&stopFlags) && --n>0 ){
137141
int c2 = c;
138142
c = *++z; ++j;
139143
if( c==0 ){
144
+ if( fVerbose && !(flags&LOOK_NUL) ){
145
+ fossil_print("NUL on line %d\n", nLine);
146
+ }
140147
flags |= LOOK_NUL; /* NUL character in a file -> binary */
141148
}else if( c=='\n' ){
142149
flags |= LOOK_LF;
143150
if( c2=='\r' ){
151
+ if( fVerbose && !(flags&LOOK_CRLF) ){
152
+ fossil_print("CRLF on line %d\n", nLine);
153
+ }
144154
flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
145155
}else{
156
+ if( fVerbose && !(flags&LOOK_LONE_LF) ){
157
+ fossil_print("Lone LF on line %d\n", nLine);
158
+ }
146159
flags |= LOOK_LONE_LF;
147160
}
148161
if( j>LENGTH_MASK ){
162
+ if( fVerbose && !(flags&LOOK_LONG) ){
163
+ fossil_print("Line %d is longer than %d bytes\n", nLine, j);
164
+ }
149165
flags |= LOOK_LONG; /* Very long line -> binary */
150166
}
167
+ ++nLine;
151168
j = 0;
152169
}else if( c=='\r' ){
153170
flags |= LOOK_CR;
154171
if( n<=1 || z[1]!='\n' ){
172
+ if( fVerbose && !(flags&LOOK_LONE_CR) ){
173
+ fossil_print("Lone CR on line %d\n", nLine);
174
+ }
155175
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
156176
}
157177
}
158178
}
159179
if( n ){
@@ -404,10 +424,11 @@
404424
** Options:
405425
** -n|--limit N Repeat looks-like function N times, for
406426
** performance measurement. Default = 1
407427
** --utf8 Ignoring BOM and file size, force UTF-8 checking
408428
** --utf16 Ignoring BOM and file size, force UTF-16 checking
429
+** -v|--verbose Report the line numbers where each flag is first set
409430
**
410431
** FILENAME is the name of a file to check for textual content in the UTF-8
411432
** and/or UTF-16 encodings.
412433
*/
413434
void looks_like_utf_test_cmd(void){
@@ -418,10 +439,11 @@
418439
int lookFlags = 0; /* output flags from looks_like_utf8/utf16() */
419440
int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
420441
int fForceUtf8 = find_option("utf8",0,0)!=0;
421442
int fForceUtf16 = find_option("utf16",0,0)!=0;
422443
const char *zCount = find_option("limit","n",1);
444
+ int fVerbose = find_option("verbose","v",0)!=0;
423445
int nRepeat = 1;
424446
425447
if( g.argc!=3 ) usage("FILENAME");
426448
if( zCount ){
427449
nRepeat = atoi(zCount);
@@ -436,11 +458,11 @@
436458
fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
437459
}
438460
if( fUnicode ){
439461
lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
440462
}else{
441
- lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
463
+ lookFlags = looks_like_utf8(&blob, 0, fVerbose) | invalid_utf8(&blob);
442464
}
443465
}
444466
fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
445467
fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
446468
fossil_print("Starts with UTF-16 BOM: %s\n",
447469
--- src/lookslike.c
+++ src/lookslike.c
@@ -29,11 +29,11 @@
29 /*
30 ** This macro is designed to return non-zero if the specified blob contains
31 ** data that MAY be binary in nature; otherwise, zero will be returned.
32 */
33 #define looks_like_binary(blob) \
34 ((looks_like_utf8((blob), LOOK_BINARY) & LOOK_BINARY) != LOOK_NONE)
35
36 /*
37 ** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
38 ** to convey status information about the blob content.
39 */
@@ -114,46 +114,66 @@
114 ** This function examines the contents of the blob until one of the flags
115 ** specified in "stopFlags" is set.
116 **
117 ************************************ WARNING **********************************
118 */
119 int looks_like_utf8(const Blob *pContent, int stopFlags){
120 const char *z = blob_buffer(pContent);
121 unsigned int n = blob_size(pContent);
122 int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */
 
123
124 if( n==0 ) return flags; /* Empty file -> text */
125 c = *z;
126 if( c==0 ){
127 flags |= LOOK_NUL; /* NUL character in a file -> binary */
 
128 }else if( c=='\r' ){
129 flags |= LOOK_CR;
 
130 if( n<=1 || z[1]!='\n' ){
131 flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
 
132 }
133 }
134 j = (c!='\n');
135 if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
136 while( !(flags&stopFlags) && --n>0 ){
137 int c2 = c;
138 c = *++z; ++j;
139 if( c==0 ){
 
 
 
140 flags |= LOOK_NUL; /* NUL character in a file -> binary */
141 }else if( c=='\n' ){
142 flags |= LOOK_LF;
143 if( c2=='\r' ){
 
 
 
144 flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
145 }else{
 
 
 
146 flags |= LOOK_LONE_LF;
147 }
148 if( j>LENGTH_MASK ){
 
 
 
149 flags |= LOOK_LONG; /* Very long line -> binary */
150 }
 
151 j = 0;
152 }else if( c=='\r' ){
153 flags |= LOOK_CR;
154 if( n<=1 || z[1]!='\n' ){
 
 
 
155 flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
156 }
157 }
158 }
159 if( n ){
@@ -404,10 +424,11 @@
404 ** Options:
405 ** -n|--limit N Repeat looks-like function N times, for
406 ** performance measurement. Default = 1
407 ** --utf8 Ignoring BOM and file size, force UTF-8 checking
408 ** --utf16 Ignoring BOM and file size, force UTF-16 checking
 
409 **
410 ** FILENAME is the name of a file to check for textual content in the UTF-8
411 ** and/or UTF-16 encodings.
412 */
413 void looks_like_utf_test_cmd(void){
@@ -418,10 +439,11 @@
418 int lookFlags = 0; /* output flags from looks_like_utf8/utf16() */
419 int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
420 int fForceUtf8 = find_option("utf8",0,0)!=0;
421 int fForceUtf16 = find_option("utf16",0,0)!=0;
422 const char *zCount = find_option("limit","n",1);
 
423 int nRepeat = 1;
424
425 if( g.argc!=3 ) usage("FILENAME");
426 if( zCount ){
427 nRepeat = atoi(zCount);
@@ -436,11 +458,11 @@
436 fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
437 }
438 if( fUnicode ){
439 lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
440 }else{
441 lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
442 }
443 }
444 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
445 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
446 fossil_print("Starts with UTF-16 BOM: %s\n",
447
--- src/lookslike.c
+++ src/lookslike.c
@@ -29,11 +29,11 @@
29 /*
30 ** This macro is designed to return non-zero if the specified blob contains
31 ** data that MAY be binary in nature; otherwise, zero will be returned.
32 */
33 #define looks_like_binary(blob) \
34 ((looks_like_utf8((blob), LOOK_BINARY, 0) & LOOK_BINARY) != LOOK_NONE)
35
36 /*
37 ** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
38 ** to convey status information about the blob content.
39 */
@@ -114,46 +114,66 @@
114 ** This function examines the contents of the blob until one of the flags
115 ** specified in "stopFlags" is set.
116 **
117 ************************************ WARNING **********************************
118 */
119 int looks_like_utf8(const Blob *pContent, int stopFlags, int fVerbose){
120 const char *z = blob_buffer(pContent);
121 unsigned int n = blob_size(pContent);
122 int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */
123 int nLine = 1;
124
125 if( n==0 ) return flags; /* Empty file -> text */
126 c = *z;
127 if( c==0 ){
128 flags |= LOOK_NUL; /* NUL character in a file -> binary */
129 if( fVerbose ) fossil_print("NUL at start\n");
130 }else if( c=='\r' ){
131 flags |= LOOK_CR;
132 if( fVerbose ) fossil_print("CR at start\n");
133 if( n<=1 || z[1]!='\n' ){
134 flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
135 if( fVerbose ) fossil_print("Lone CR at start\n");
136 }
137 }
138 j = (c!='\n');
139 if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
140 while( !(flags&stopFlags) && --n>0 ){
141 int c2 = c;
142 c = *++z; ++j;
143 if( c==0 ){
144 if( fVerbose && !(flags&LOOK_NUL) ){
145 fossil_print("NUL on line %d\n", nLine);
146 }
147 flags |= LOOK_NUL; /* NUL character in a file -> binary */
148 }else if( c=='\n' ){
149 flags |= LOOK_LF;
150 if( c2=='\r' ){
151 if( fVerbose && !(flags&LOOK_CRLF) ){
152 fossil_print("CRLF on line %d\n", nLine);
153 }
154 flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
155 }else{
156 if( fVerbose && !(flags&LOOK_LONE_LF) ){
157 fossil_print("Lone LF on line %d\n", nLine);
158 }
159 flags |= LOOK_LONE_LF;
160 }
161 if( j>LENGTH_MASK ){
162 if( fVerbose && !(flags&LOOK_LONG) ){
163 fossil_print("Line %d is longer than %d bytes\n", nLine, j);
164 }
165 flags |= LOOK_LONG; /* Very long line -> binary */
166 }
167 ++nLine;
168 j = 0;
169 }else if( c=='\r' ){
170 flags |= LOOK_CR;
171 if( n<=1 || z[1]!='\n' ){
172 if( fVerbose && !(flags&LOOK_LONE_CR) ){
173 fossil_print("Lone CR on line %d\n", nLine);
174 }
175 flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
176 }
177 }
178 }
179 if( n ){
@@ -404,10 +424,11 @@
424 ** Options:
425 ** -n|--limit N Repeat looks-like function N times, for
426 ** performance measurement. Default = 1
427 ** --utf8 Ignoring BOM and file size, force UTF-8 checking
428 ** --utf16 Ignoring BOM and file size, force UTF-16 checking
429 ** -v|--verbose Report the line numbers where each flag is first set
430 **
431 ** FILENAME is the name of a file to check for textual content in the UTF-8
432 ** and/or UTF-16 encodings.
433 */
434 void looks_like_utf_test_cmd(void){
@@ -418,10 +439,11 @@
439 int lookFlags = 0; /* output flags from looks_like_utf8/utf16() */
440 int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
441 int fForceUtf8 = find_option("utf8",0,0)!=0;
442 int fForceUtf16 = find_option("utf16",0,0)!=0;
443 const char *zCount = find_option("limit","n",1);
444 int fVerbose = find_option("verbose","v",0)!=0;
445 int nRepeat = 1;
446
447 if( g.argc!=3 ) usage("FILENAME");
448 if( zCount ){
449 nRepeat = atoi(zCount);
@@ -436,11 +458,11 @@
458 fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
459 }
460 if( fUnicode ){
461 lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
462 }else{
463 lookFlags = looks_like_utf8(&blob, 0, fVerbose) | invalid_utf8(&blob);
464 }
465 }
466 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
467 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
468 fossil_print("Starts with UTF-16 BOM: %s\n",
469

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button