Fossil SCM

Enhance the html_to_plaintext() routine with a flags argument so that it can take over the job previously done by search_snippet_to_plaintext().

drh 2025-03-21 18:31 trunk
Commit fd59eb178bbc2b8338c6cdc43fe7856e98ad94a120443bd1e366f3b9f2e0703c
+1 -1
--- src/checkin.c
+++ src/checkin.c
@@ -2394,11 +2394,11 @@
23942394
Blob in, html, txt;
23952395
blob_init(&in, blob_str(pComment), -1);
23962396
blob_init(&html, 0, 0);
23972397
blob_init(&txt, 0, 0);
23982398
wiki_convert(&in, &html, WIKI_INLINE);
2399
- html_to_plaintext(blob_str(&html), &txt);
2399
+ html_to_plaintext(blob_str(&html), &txt, HTOT_VT100);
24002400
if( nGot>0 ){
24012401
fossil_print(
24022402
"The comment uses special character%s \"%s\". "
24032403
"Does it render as you expect?\n\n ",
24042404
(nGot>1 ? "s" : ""), zGot
24052405
--- src/checkin.c
+++ src/checkin.c
@@ -2394,11 +2394,11 @@
2394 Blob in, html, txt;
2395 blob_init(&in, blob_str(pComment), -1);
2396 blob_init(&html, 0, 0);
2397 blob_init(&txt, 0, 0);
2398 wiki_convert(&in, &html, WIKI_INLINE);
2399 html_to_plaintext(blob_str(&html), &txt);
2400 if( nGot>0 ){
2401 fossil_print(
2402 "The comment uses special character%s \"%s\". "
2403 "Does it render as you expect?\n\n ",
2404 (nGot>1 ? "s" : ""), zGot
2405
--- src/checkin.c
+++ src/checkin.c
@@ -2394,11 +2394,11 @@
2394 Blob in, html, txt;
2395 blob_init(&in, blob_str(pComment), -1);
2396 blob_init(&html, 0, 0);
2397 blob_init(&txt, 0, 0);
2398 wiki_convert(&in, &html, WIKI_INLINE);
2399 html_to_plaintext(blob_str(&html), &txt, HTOT_VT100);
2400 if( nGot>0 ){
2401 fossil_print(
2402 "The comment uses special character%s \"%s\". "
2403 "Does it render as you expect?\n\n ",
2404 (nGot>1 ? "s" : ""), zGot
2405
+4 -4
--- src/comformat.c
+++ src/comformat.c
@@ -266,14 +266,14 @@
266266
int i = 0; /* Counted bytes. */
267267
int cchUTF8 = 1; /* Code units consumed. */
268268
int maxUTF8 = 1; /* Expected sequence length. */
269269
char c = z[i++];
270270
if( c==0x1b && z[i]=='[' ){
271
- do{
272
- i++;
273
- }while( i<fossil_isdigit(z[i]) || z[i]==';' );
274
- if( fossil_isalpha(z[i]) ){
271
+ i++;
272
+ while( z[i]>=0x30 && z[i]<=0x3f ){ i++; }
273
+ while( z[i]>=0x20 && z[i]<=0x2f ){ i++; }
274
+ if( z[i]>=0x40 && z[i]<=0x7e ){
275275
*pCchUTF8 = i+1;
276276
*pUtf32 = 0x301; /* A zero-width character */
277277
return;
278278
}
279279
}
280280
--- src/comformat.c
+++ src/comformat.c
@@ -266,14 +266,14 @@
266 int i = 0; /* Counted bytes. */
267 int cchUTF8 = 1; /* Code units consumed. */
268 int maxUTF8 = 1; /* Expected sequence length. */
269 char c = z[i++];
270 if( c==0x1b && z[i]=='[' ){
271 do{
272 i++;
273 }while( i<fossil_isdigit(z[i]) || z[i]==';' );
274 if( fossil_isalpha(z[i]) ){
275 *pCchUTF8 = i+1;
276 *pUtf32 = 0x301; /* A zero-width character */
277 return;
278 }
279 }
280
--- src/comformat.c
+++ src/comformat.c
@@ -266,14 +266,14 @@
266 int i = 0; /* Counted bytes. */
267 int cchUTF8 = 1; /* Code units consumed. */
268 int maxUTF8 = 1; /* Expected sequence length. */
269 char c = z[i++];
270 if( c==0x1b && z[i]=='[' ){
271 i++;
272 while( z[i]>=0x30 && z[i]<=0x3f ){ i++; }
273 while( z[i]>=0x20 && z[i]<=0x2f ){ i++; }
274 if( z[i]>=0x40 && z[i]<=0x7e ){
275 *pCchUTF8 = i+1;
276 *pUtf32 = 0x301; /* A zero-width character */
277 return;
278 }
279 }
280
+1 -1
--- src/doc.c
+++ src/doc.c
@@ -639,11 +639,11 @@
639639
** are special to HTML encoded. We need to decode these before turning
640640
** the text into a title, as the title text will be reencoded later */
641641
char *zTitle = mprintf("%.*s", nValue, zValue);
642642
int i;
643643
for(i=0; fossil_isspace(zTitle[i]); i++){}
644
- html_to_plaintext(zTitle+i, pTitle);
644
+ html_to_plaintext(zTitle+i, pTitle, 0);
645645
fossil_free(zTitle);
646646
seenTitle = 1;
647647
if( seenClass ) return 1;
648648
}
649649
}
650650
--- src/doc.c
+++ src/doc.c
@@ -639,11 +639,11 @@
639 ** are special to HTML encoded. We need to decode these before turning
640 ** the text into a title, as the title text will be reencoded later */
641 char *zTitle = mprintf("%.*s", nValue, zValue);
642 int i;
643 for(i=0; fossil_isspace(zTitle[i]); i++){}
644 html_to_plaintext(zTitle+i, pTitle);
645 fossil_free(zTitle);
646 seenTitle = 1;
647 if( seenClass ) return 1;
648 }
649 }
650
--- src/doc.c
+++ src/doc.c
@@ -639,11 +639,11 @@
639 ** are special to HTML encoded. We need to decode these before turning
640 ** the text into a title, as the title text will be reencoded later */
641 char *zTitle = mprintf("%.*s", nValue, zValue);
642 int i;
643 for(i=0; fossil_isspace(zTitle[i]); i++){}
644 html_to_plaintext(zTitle+i, pTitle, 0);
645 fossil_free(zTitle);
646 seenTitle = 1;
647 if( seenClass ) return 1;
648 }
649 }
650
+8 -87
--- src/search.c
+++ src/search.c
@@ -559,93 +559,10 @@
559559
search_body_sqlfunc, 0, 0);
560560
sqlite3_create_function(db, "urlencode", 1, enc, 0,
561561
search_urlencode_sqlfunc, 0, 0);
562562
}
563563
564
-/*
565
-** The pSnip input contains snippet text from a search formatted
566
-** as HTML. Attempt to make that text more readable on a TTY.
567
-**
568
-** If nTty is positive, use ANSI escape codes "\e[Nm" where N is nTty
569
-** to highlight marked text.
570
-*/
571
-void search_snippet_to_plaintext(Blob *pSnip, int nTty){
572
- char *zSnip;
573
- unsigned int j, k;
574
-
575
- zSnip = pSnip->aData;
576
- for(j=k=0; j<pSnip->nUsed; j++){
577
- char c = zSnip[j];
578
- if( c=='<' ){
579
- if( memcmp(&zSnip[j],"<mark>",6)==0 ){
580
- if( nTty ){
581
- zSnip[k++] = 0x1b;
582
- zSnip[k++] = '[';
583
- if( nTty>=10 ) zSnip[k++] = (nTty/10)%10 + '0';
584
- zSnip[k++] = nTty%10 + '0';
585
- zSnip[k++] = 'm';
586
- }
587
- j += 5;
588
- }else if( memcmp(&zSnip[j],"</mark>",7)==0 ){
589
- if( nTty ){
590
- zSnip[k++] = 0x1b;
591
- zSnip[k++] = '[';
592
- zSnip[k++] = '0';
593
- zSnip[k++] = 'm';
594
- }
595
- j += 6;
596
- }else{
597
- zSnip[k++] = zSnip[j];
598
- }
599
- }else if( fossil_isspace(c) ){
600
- zSnip[k++] = ' ';
601
- while( fossil_isspace(zSnip[j+1]) ) j++;
602
- }else if( c=='&' ){
603
- if( zSnip[j+1]=='#' && fossil_isdigit(zSnip[j+2]) ){
604
- int n = 3;
605
- int x = zSnip[j+2] - '0';
606
- if( fossil_isdigit(zSnip[j+3]) ){
607
- x = x*10 + zSnip[j+3] - '0';
608
- n++;
609
- if( fossil_isdigit(zSnip[j+4]) ){
610
- x = x*10 + zSnip[j+4] - '0';
611
- n++;
612
- }
613
- }
614
- if( zSnip[j+n]==';' ){
615
- zSnip[k++] = (char)x;
616
- j += n;
617
- }else{
618
- zSnip[k++] = c;
619
- }
620
- }else if( memcmp(&zSnip[j],"&lt;",4)==0 ){
621
- zSnip[k++] = '<';
622
- j += 3;
623
- }else if( memcmp(&zSnip[j],"&gt;",4)==0 ){
624
- zSnip[k++] = '>';
625
- j += 3;
626
- }else if( memcmp(&zSnip[j],"&quot;",6)==0 ){
627
- zSnip[k++] = '"';
628
- j += 5;
629
- }else if( memcmp(&zSnip[j],"&amp;",5)==0 ){
630
- zSnip[k++] = '&';
631
- j += 4;
632
- }else{
633
- zSnip[k++] = c;
634
- }
635
- }else if( c=='%' && strncmp(&zSnip[j],"%fossil",7)==0 ){
636
- /* no-op */
637
- }else if( (c=='[' || c==']') && zSnip[j+1]==c ){
638
- j++;
639
- }else{
640
- zSnip[k++] = c;
641
- }
642
- }
643
- zSnip[k] = 0;
644
- pSnip->nUsed = k;
645
-}
646
-
647564
/*
648565
** Testing the search function.
649566
**
650567
** COMMAND: search*
651568
**
@@ -805,19 +722,23 @@
805722
db_prepare(&q, "SELECT snip, label, score, id, date"
806723
" FROM x"
807724
" ORDER BY score DESC, date DESC;");
808725
blob_init(&com, 0, 0);
809726
blob_init(&snip, 0, 0);
810
- if( width<0 ) width = 80;
727
+ if( width<0 ) width = terminal_get_width(80);
811728
while( db_step(&q)==SQLITE_ROW ){
812729
const char *zSnippet = db_column_text(&q, 0);
813730
const char *zLabel = db_column_text(&q, 1);
814731
const char *zDate = db_column_text(&q, 4);
815732
const char *zScore = db_column_text(&q, 2);
816733
const char *zId = db_column_text(&q, 3);
734
+ char *zOrig;
817735
blob_appendf(&snip, "%s", zSnippet);
818
- search_snippet_to_plaintext(&snip, nTty);
736
+ zOrig = blob_materialize(&snip);
737
+ blob_init(&snip, 0, 0);
738
+ html_to_plaintext(zOrig, &snip, (nTty ? HTOT_VT100 : 0)|HTOT_NO_WS);
739
+ fossil_free(zOrig);
819740
blob_appendf(&com, "%s\n%s\n%s", zLabel, blob_str(&snip), zDate);
820741
if( bDebug ){
821742
blob_appendf(&com," score: %s id: %s", zScore, zId);
822743
}
823744
comment_print(blob_str(&com), 0, 5, width,
@@ -1557,20 +1478,20 @@
15571478
}else{
15581479
blob_append(pOut, "\n", 1);
15591480
wiki_convert(pIn, &html, 0);
15601481
}
15611482
}
1562
- html_to_plaintext(blob_str(&html), pOut);
1483
+ html_to_plaintext(blob_str(&html), pOut, 0);
15631484
}else if( fossil_strcmp(zMimetype,"text/x-markdown")==0 ){
15641485
markdown_to_html(pIn, blob_size(&title) ? NULL : &title, &html);
15651486
}else if( fossil_strcmp(zMimetype,"text/html")==0 ){
15661487
if( blob_size(&title)==0 ) doc_is_embedded_html(pIn, &title);
15671488
pHtml = pIn;
15681489
}
15691490
blob_appendf(pOut, "%s\n", blob_str(&title));
15701491
if( blob_size(pHtml) ){
1571
- html_to_plaintext(blob_str(pHtml), pOut);
1492
+ html_to_plaintext(blob_str(pHtml), pOut, 0);
15721493
}else{
15731494
blob_append(pOut, blob_buffer(pIn), blob_size(pIn));
15741495
}
15751496
blob_reset(&html);
15761497
blob_reset(&title);
15771498
--- src/search.c
+++ src/search.c
@@ -559,93 +559,10 @@
559 search_body_sqlfunc, 0, 0);
560 sqlite3_create_function(db, "urlencode", 1, enc, 0,
561 search_urlencode_sqlfunc, 0, 0);
562 }
563
564 /*
565 ** The pSnip input contains snippet text from a search formatted
566 ** as HTML. Attempt to make that text more readable on a TTY.
567 **
568 ** If nTty is positive, use ANSI escape codes "\e[Nm" where N is nTty
569 ** to highlight marked text.
570 */
571 void search_snippet_to_plaintext(Blob *pSnip, int nTty){
572 char *zSnip;
573 unsigned int j, k;
574
575 zSnip = pSnip->aData;
576 for(j=k=0; j<pSnip->nUsed; j++){
577 char c = zSnip[j];
578 if( c=='<' ){
579 if( memcmp(&zSnip[j],"<mark>",6)==0 ){
580 if( nTty ){
581 zSnip[k++] = 0x1b;
582 zSnip[k++] = '[';
583 if( nTty>=10 ) zSnip[k++] = (nTty/10)%10 + '0';
584 zSnip[k++] = nTty%10 + '0';
585 zSnip[k++] = 'm';
586 }
587 j += 5;
588 }else if( memcmp(&zSnip[j],"</mark>",7)==0 ){
589 if( nTty ){
590 zSnip[k++] = 0x1b;
591 zSnip[k++] = '[';
592 zSnip[k++] = '0';
593 zSnip[k++] = 'm';
594 }
595 j += 6;
596 }else{
597 zSnip[k++] = zSnip[j];
598 }
599 }else if( fossil_isspace(c) ){
600 zSnip[k++] = ' ';
601 while( fossil_isspace(zSnip[j+1]) ) j++;
602 }else if( c=='&' ){
603 if( zSnip[j+1]=='#' && fossil_isdigit(zSnip[j+2]) ){
604 int n = 3;
605 int x = zSnip[j+2] - '0';
606 if( fossil_isdigit(zSnip[j+3]) ){
607 x = x*10 + zSnip[j+3] - '0';
608 n++;
609 if( fossil_isdigit(zSnip[j+4]) ){
610 x = x*10 + zSnip[j+4] - '0';
611 n++;
612 }
613 }
614 if( zSnip[j+n]==';' ){
615 zSnip[k++] = (char)x;
616 j += n;
617 }else{
618 zSnip[k++] = c;
619 }
620 }else if( memcmp(&zSnip[j],"&lt;",4)==0 ){
621 zSnip[k++] = '<';
622 j += 3;
623 }else if( memcmp(&zSnip[j],"&gt;",4)==0 ){
624 zSnip[k++] = '>';
625 j += 3;
626 }else if( memcmp(&zSnip[j],"&quot;",6)==0 ){
627 zSnip[k++] = '"';
628 j += 5;
629 }else if( memcmp(&zSnip[j],"&amp;",5)==0 ){
630 zSnip[k++] = '&';
631 j += 4;
632 }else{
633 zSnip[k++] = c;
634 }
635 }else if( c=='%' && strncmp(&zSnip[j],"%fossil",7)==0 ){
636 /* no-op */
637 }else if( (c=='[' || c==']') && zSnip[j+1]==c ){
638 j++;
639 }else{
640 zSnip[k++] = c;
641 }
642 }
643 zSnip[k] = 0;
644 pSnip->nUsed = k;
645 }
646
647 /*
648 ** Testing the search function.
649 **
650 ** COMMAND: search*
651 **
@@ -805,19 +722,23 @@
805 db_prepare(&q, "SELECT snip, label, score, id, date"
806 " FROM x"
807 " ORDER BY score DESC, date DESC;");
808 blob_init(&com, 0, 0);
809 blob_init(&snip, 0, 0);
810 if( width<0 ) width = 80;
811 while( db_step(&q)==SQLITE_ROW ){
812 const char *zSnippet = db_column_text(&q, 0);
813 const char *zLabel = db_column_text(&q, 1);
814 const char *zDate = db_column_text(&q, 4);
815 const char *zScore = db_column_text(&q, 2);
816 const char *zId = db_column_text(&q, 3);
 
817 blob_appendf(&snip, "%s", zSnippet);
818 search_snippet_to_plaintext(&snip, nTty);
 
 
 
819 blob_appendf(&com, "%s\n%s\n%s", zLabel, blob_str(&snip), zDate);
820 if( bDebug ){
821 blob_appendf(&com," score: %s id: %s", zScore, zId);
822 }
823 comment_print(blob_str(&com), 0, 5, width,
@@ -1557,20 +1478,20 @@
1557 }else{
1558 blob_append(pOut, "\n", 1);
1559 wiki_convert(pIn, &html, 0);
1560 }
1561 }
1562 html_to_plaintext(blob_str(&html), pOut);
1563 }else if( fossil_strcmp(zMimetype,"text/x-markdown")==0 ){
1564 markdown_to_html(pIn, blob_size(&title) ? NULL : &title, &html);
1565 }else if( fossil_strcmp(zMimetype,"text/html")==0 ){
1566 if( blob_size(&title)==0 ) doc_is_embedded_html(pIn, &title);
1567 pHtml = pIn;
1568 }
1569 blob_appendf(pOut, "%s\n", blob_str(&title));
1570 if( blob_size(pHtml) ){
1571 html_to_plaintext(blob_str(pHtml), pOut);
1572 }else{
1573 blob_append(pOut, blob_buffer(pIn), blob_size(pIn));
1574 }
1575 blob_reset(&html);
1576 blob_reset(&title);
1577
--- src/search.c
+++ src/search.c
@@ -559,93 +559,10 @@
559 search_body_sqlfunc, 0, 0);
560 sqlite3_create_function(db, "urlencode", 1, enc, 0,
561 search_urlencode_sqlfunc, 0, 0);
562 }
563
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564 /*
565 ** Testing the search function.
566 **
567 ** COMMAND: search*
568 **
@@ -805,19 +722,23 @@
722 db_prepare(&q, "SELECT snip, label, score, id, date"
723 " FROM x"
724 " ORDER BY score DESC, date DESC;");
725 blob_init(&com, 0, 0);
726 blob_init(&snip, 0, 0);
727 if( width<0 ) width = terminal_get_width(80);
728 while( db_step(&q)==SQLITE_ROW ){
729 const char *zSnippet = db_column_text(&q, 0);
730 const char *zLabel = db_column_text(&q, 1);
731 const char *zDate = db_column_text(&q, 4);
732 const char *zScore = db_column_text(&q, 2);
733 const char *zId = db_column_text(&q, 3);
734 char *zOrig;
735 blob_appendf(&snip, "%s", zSnippet);
736 zOrig = blob_materialize(&snip);
737 blob_init(&snip, 0, 0);
738 html_to_plaintext(zOrig, &snip, (nTty ? HTOT_VT100 : 0)|HTOT_NO_WS);
739 fossil_free(zOrig);
740 blob_appendf(&com, "%s\n%s\n%s", zLabel, blob_str(&snip), zDate);
741 if( bDebug ){
742 blob_appendf(&com," score: %s id: %s", zScore, zId);
743 }
744 comment_print(blob_str(&com), 0, 5, width,
@@ -1557,20 +1478,20 @@
1478 }else{
1479 blob_append(pOut, "\n", 1);
1480 wiki_convert(pIn, &html, 0);
1481 }
1482 }
1483 html_to_plaintext(blob_str(&html), pOut, 0);
1484 }else if( fossil_strcmp(zMimetype,"text/x-markdown")==0 ){
1485 markdown_to_html(pIn, blob_size(&title) ? NULL : &title, &html);
1486 }else if( fossil_strcmp(zMimetype,"text/html")==0 ){
1487 if( blob_size(&title)==0 ) doc_is_embedded_html(pIn, &title);
1488 pHtml = pIn;
1489 }
1490 blob_appendf(pOut, "%s\n", blob_str(&title));
1491 if( blob_size(pHtml) ){
1492 html_to_plaintext(blob_str(pHtml), pOut, 0);
1493 }else{
1494 blob_append(pOut, blob_buffer(pIn), blob_size(pIn));
1495 }
1496 blob_reset(&html);
1497 blob_reset(&title);
1498
+1 -1
--- src/timeline.c
+++ src/timeline.c
@@ -3437,11 +3437,11 @@
34373437
blob_init(&in, zIn, nIn);
34383438
blob_init(&html, 0, 0);
34393439
wiki_convert(&in, &html, WIKI_INLINE);
34403440
blob_reset(&in);
34413441
blob_init(&txt, 0, 0);
3442
- html_to_plaintext(blob_str(&html), &txt);
3442
+ html_to_plaintext(blob_str(&html), &txt, 0);
34433443
blob_reset(&html);
34443444
nOut = blob_size(&txt);
34453445
zOut = blob_str(&txt);
34463446
while( fossil_isspace(zOut[0]) ){ zOut++; nOut--; }
34473447
while( nOut>0 && fossil_isspace(zOut[nOut-1]) ){ nOut--; }
34483448
--- src/timeline.c
+++ src/timeline.c
@@ -3437,11 +3437,11 @@
3437 blob_init(&in, zIn, nIn);
3438 blob_init(&html, 0, 0);
3439 wiki_convert(&in, &html, WIKI_INLINE);
3440 blob_reset(&in);
3441 blob_init(&txt, 0, 0);
3442 html_to_plaintext(blob_str(&html), &txt);
3443 blob_reset(&html);
3444 nOut = blob_size(&txt);
3445 zOut = blob_str(&txt);
3446 while( fossil_isspace(zOut[0]) ){ zOut++; nOut--; }
3447 while( nOut>0 && fossil_isspace(zOut[nOut-1]) ){ nOut--; }
3448
--- src/timeline.c
+++ src/timeline.c
@@ -3437,11 +3437,11 @@
3437 blob_init(&in, zIn, nIn);
3438 blob_init(&html, 0, 0);
3439 wiki_convert(&in, &html, WIKI_INLINE);
3440 blob_reset(&in);
3441 blob_init(&txt, 0, 0);
3442 html_to_plaintext(blob_str(&html), &txt, 0);
3443 blob_reset(&html);
3444 nOut = blob_size(&txt);
3445 zOut = blob_str(&txt);
3446 while( fossil_isspace(zOut[0]) ){ zOut++; nOut--; }
3447 while( nOut>0 && fossil_isspace(zOut[nOut-1]) ){ nOut--; }
3448
+48 -11
--- src/wikiformat.c
+++ src/wikiformat.c
@@ -1975,11 +1975,11 @@
19751975
blob_read_from_file(&in, g.argv[2], ExtFILE);
19761976
mType = wiki_convert(&in, &out, flags);
19771977
if( bText ){
19781978
Blob txt;
19791979
blob_init(&txt, 0, 0);
1980
- html_to_plaintext(blob_str(&out),&txt);
1980
+ html_to_plaintext(blob_str(&out),&txt, HTOT_VT100);
19811981
blob_reset(&out);
19821982
out = txt;
19831983
}
19841984
blob_write_to_file(&out, "-");
19851985
if( showType ){
@@ -2034,11 +2034,11 @@
20342034
safe_html_context( bSafe ? DOCSRC_UNTRUSTED : DOCSRC_TRUSTED );
20352035
safe_html(&out);
20362036
if( bText ){
20372037
Blob txt;
20382038
blob_init(&txt, 0, 0);
2039
- html_to_plaintext(blob_str(&out), &txt);
2039
+ html_to_plaintext(blob_str(&out), &txt, HTOT_VT100);
20402040
blob_reset(&out);
20412041
out = txt;
20422042
}
20432043
blob_write_to_file(&out, "-");
20442044
blob_reset(&in);
@@ -2505,25 +2505,37 @@
25052505
fossil_puts(blob_buffer(&out), 0, blob_size(&out));
25062506
blob_reset(&out);
25072507
}
25082508
}
25092509
2510
+#if INTERFACE
2511
+/*
2512
+** Allowed flag options for html_to_plaintext().
2513
+*/
2514
+#define HTOT_VT100 0x0001 /* <mark> becomes ^[[91m */
2515
+#define HTOT_NO_WS 0x0002 /* Collapse whitespace to a single space */
2516
+
2517
+#endif /* INTERFACE */
2518
+
25102519
/*
25112520
** Remove all HTML markup from the input text. The output written into
25122521
** pOut is pure text.
25132522
**
25142523
** Put the title on the first line, if there is any <title> markup.
25152524
** If there is no <title>, then create a blank first line.
25162525
*/
2517
-void html_to_plaintext(const char *zIn, Blob *pOut){
2526
+void html_to_plaintext(const char *zIn, Blob *pOut, int mFlags){
25182527
int n;
25192528
int i, j;
2520
- int inTitle = 0; /* True between <title>...</title> */
2529
+ int bNoWS = 0; /* Transform WS into a single space */
25212530
int seenText = 0; /* True after first non-whitespace seen */
25222531
int nNL = 0; /* Number of \n characters at the end of pOut */
25232532
int nWS = 0; /* True if pOut ends with whitespace */
2524
- while( fossil_isspace(zIn[0]) ) zIn++;
2533
+ int nMark = 0; /* True if inside of <mark>..</mark> */
2534
+
2535
+ while( fossil_isspace(zIn[0]) ) zIn++; /* Skip leading whitespace */
2536
+ if( mFlags & HTOT_NO_WS ) bNoWS = 1;
25252537
while( zIn[0] ){
25262538
n = html_token_length(zIn);
25272539
if( zIn[0]=='<' && n>1 ){
25282540
int isCloseTag;
25292541
int eTag;
@@ -2543,13 +2555,30 @@
25432555
if( fossil_strnicmp(zIn, "</style",7)==0 ) break;
25442556
zIn += n;
25452557
}
25462558
if( zIn[0]=='<' ) zIn += n;
25472559
continue;
2560
+ }
2561
+ if( eTag==MARKUP_INVALID && strcmp(zTag,"mark")==0 ){
2562
+ if( (mFlags & HTOT_VT100)!=0 ){
2563
+ if( isCloseTag && nMark ){
2564
+ blob_append(pOut, "\033[0m", 4);
2565
+ nMark = 0;
2566
+ }else if( !isCloseTag && !nMark ){
2567
+ blob_append(pOut, "\033[91m", 5);
2568
+ nMark = 1;
2569
+ }
2570
+ }
2571
+ zIn += n;
2572
+ continue;
25482573
}
25492574
if( eTag==MARKUP_TITLE ){
2550
- inTitle = !isCloseTag;
2575
+ if( isCloseTag && (mFlags & HTOT_NO_WS)==0 ){
2576
+ bNoWS = 0;
2577
+ }else{
2578
+ bNoWS = 1;
2579
+ }
25512580
}
25522581
if( !isCloseTag && seenText && (eType & (MUTYPE_BLOCK|MUTYPE_TABLE))!=0 ){
25532582
if( nNL==0 ){
25542583
blob_append_char(pOut, '\n');
25552584
nNL++;
@@ -2557,11 +2586,11 @@
25572586
nWS = 1;
25582587
}
25592588
}else if( fossil_isspace(zIn[0]) ){
25602589
if( seenText ){
25612590
nNL = 0;
2562
- if( !inTitle ){ /* '\n' -> ' ' within <title> */
2591
+ if( !bNoWS ){ /* '\n' -> ' ' within <title> */
25632592
for(i=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;
25642593
}
25652594
if( !nWS ){
25662595
blob_append_char(pOut, nNL ? '\n' : ' ');
25672596
nWS = 1;
@@ -2591,11 +2620,11 @@
25912620
if( fossil_isspace(c) ){
25922621
if( nWS==0 && seenText ) blob_append_char(pOut, c);
25932622
nWS = 1;
25942623
nNL = c=='\n';
25952624
}else{
2596
- if( !seenText && !inTitle ) blob_append_char(pOut, '\n');
2625
+ if( !seenText && !bNoWS ) blob_append_char(pOut, '\n');
25972626
seenText = 1;
25982627
nNL = nWS = 0;
25992628
if( c<0x00080 ){
26002629
blob_append_char(pOut, c & 0xff);
26012630
}else if( c<0x00800 ){
@@ -2611,40 +2640,48 @@
26112640
blob_append_char(pOut, 0x80 + (u8)((c>>6)&0x3f));
26122641
blob_append_char(pOut, 0x80 + (u8)(c&0x3f));
26132642
}
26142643
}
26152644
}else{
2616
- if( !seenText && !inTitle ) blob_append_char(pOut, '\n');
2645
+ if( !seenText && !bNoWS ) blob_append_char(pOut, '\n');
26172646
seenText = 1;
26182647
nNL = nWS = 0;
26192648
blob_append(pOut, zIn, n);
26202649
}
26212650
zIn += n;
26222651
}
2652
+ if( nMark ) blob_append(pOut, "\033[0m", 4);
26232653
if( nNL==0 ) blob_append_char(pOut, '\n');
26242654
}
26252655
26262656
/*
26272657
** COMMAND: test-html-to-text
26282658
**
2629
-** Usage: %fossil test-html-to-text FILE ...
2659
+** Usage: %fossil test-html-to-text [OPTIONS] FILE ...
26302660
**
26312661
** Read all files named on the command-line. Convert the file
26322662
** content from HTML to text and write the results on standard
26332663
** output.
26342664
**
26352665
** This command is intended as a test and debug interface for
26362666
** the html_to_plaintext() routine.
2667
+**
2668
+** Options:
2669
+**
2670
+** --vt100 Translate <mark> and </mark> into ANSI/VT100
2671
+** escapes to highlight the contained text.
26372672
*/
26382673
void test_html_to_text(void){
26392674
Blob in, out;
26402675
int i;
2676
+ int mFlags = 0;
2677
+ if( find_option("vt100",0,0)!=0 ) mFlags |= HTOT_VT100;
26412678
26422679
for(i=2; i<g.argc; i++){
26432680
blob_read_from_file(&in, g.argv[i], ExtFILE);
26442681
blob_zero(&out);
2645
- html_to_plaintext(blob_str(&in), &out);
2682
+ html_to_plaintext(blob_str(&in), &out, mFlags);
26462683
blob_reset(&in);
26472684
fossil_puts(blob_buffer(&out), 0, blob_size(&out));
26482685
blob_reset(&out);
26492686
}
26502687
}
26512688
--- src/wikiformat.c
+++ src/wikiformat.c
@@ -1975,11 +1975,11 @@
1975 blob_read_from_file(&in, g.argv[2], ExtFILE);
1976 mType = wiki_convert(&in, &out, flags);
1977 if( bText ){
1978 Blob txt;
1979 blob_init(&txt, 0, 0);
1980 html_to_plaintext(blob_str(&out),&txt);
1981 blob_reset(&out);
1982 out = txt;
1983 }
1984 blob_write_to_file(&out, "-");
1985 if( showType ){
@@ -2034,11 +2034,11 @@
2034 safe_html_context( bSafe ? DOCSRC_UNTRUSTED : DOCSRC_TRUSTED );
2035 safe_html(&out);
2036 if( bText ){
2037 Blob txt;
2038 blob_init(&txt, 0, 0);
2039 html_to_plaintext(blob_str(&out), &txt);
2040 blob_reset(&out);
2041 out = txt;
2042 }
2043 blob_write_to_file(&out, "-");
2044 blob_reset(&in);
@@ -2505,25 +2505,37 @@
2505 fossil_puts(blob_buffer(&out), 0, blob_size(&out));
2506 blob_reset(&out);
2507 }
2508 }
2509
 
 
 
 
 
 
 
 
 
2510 /*
2511 ** Remove all HTML markup from the input text. The output written into
2512 ** pOut is pure text.
2513 **
2514 ** Put the title on the first line, if there is any <title> markup.
2515 ** If there is no <title>, then create a blank first line.
2516 */
2517 void html_to_plaintext(const char *zIn, Blob *pOut){
2518 int n;
2519 int i, j;
2520 int inTitle = 0; /* True between <title>...</title> */
2521 int seenText = 0; /* True after first non-whitespace seen */
2522 int nNL = 0; /* Number of \n characters at the end of pOut */
2523 int nWS = 0; /* True if pOut ends with whitespace */
2524 while( fossil_isspace(zIn[0]) ) zIn++;
 
 
 
2525 while( zIn[0] ){
2526 n = html_token_length(zIn);
2527 if( zIn[0]=='<' && n>1 ){
2528 int isCloseTag;
2529 int eTag;
@@ -2543,13 +2555,30 @@
2543 if( fossil_strnicmp(zIn, "</style",7)==0 ) break;
2544 zIn += n;
2545 }
2546 if( zIn[0]=='<' ) zIn += n;
2547 continue;
 
 
 
 
 
 
 
 
 
 
 
 
 
2548 }
2549 if( eTag==MARKUP_TITLE ){
2550 inTitle = !isCloseTag;
 
 
 
 
2551 }
2552 if( !isCloseTag && seenText && (eType & (MUTYPE_BLOCK|MUTYPE_TABLE))!=0 ){
2553 if( nNL==0 ){
2554 blob_append_char(pOut, '\n');
2555 nNL++;
@@ -2557,11 +2586,11 @@
2557 nWS = 1;
2558 }
2559 }else if( fossil_isspace(zIn[0]) ){
2560 if( seenText ){
2561 nNL = 0;
2562 if( !inTitle ){ /* '\n' -> ' ' within <title> */
2563 for(i=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;
2564 }
2565 if( !nWS ){
2566 blob_append_char(pOut, nNL ? '\n' : ' ');
2567 nWS = 1;
@@ -2591,11 +2620,11 @@
2591 if( fossil_isspace(c) ){
2592 if( nWS==0 && seenText ) blob_append_char(pOut, c);
2593 nWS = 1;
2594 nNL = c=='\n';
2595 }else{
2596 if( !seenText && !inTitle ) blob_append_char(pOut, '\n');
2597 seenText = 1;
2598 nNL = nWS = 0;
2599 if( c<0x00080 ){
2600 blob_append_char(pOut, c & 0xff);
2601 }else if( c<0x00800 ){
@@ -2611,40 +2640,48 @@
2611 blob_append_char(pOut, 0x80 + (u8)((c>>6)&0x3f));
2612 blob_append_char(pOut, 0x80 + (u8)(c&0x3f));
2613 }
2614 }
2615 }else{
2616 if( !seenText && !inTitle ) blob_append_char(pOut, '\n');
2617 seenText = 1;
2618 nNL = nWS = 0;
2619 blob_append(pOut, zIn, n);
2620 }
2621 zIn += n;
2622 }
 
2623 if( nNL==0 ) blob_append_char(pOut, '\n');
2624 }
2625
2626 /*
2627 ** COMMAND: test-html-to-text
2628 **
2629 ** Usage: %fossil test-html-to-text FILE ...
2630 **
2631 ** Read all files named on the command-line. Convert the file
2632 ** content from HTML to text and write the results on standard
2633 ** output.
2634 **
2635 ** This command is intended as a test and debug interface for
2636 ** the html_to_plaintext() routine.
 
 
 
 
 
2637 */
2638 void test_html_to_text(void){
2639 Blob in, out;
2640 int i;
 
 
2641
2642 for(i=2; i<g.argc; i++){
2643 blob_read_from_file(&in, g.argv[i], ExtFILE);
2644 blob_zero(&out);
2645 html_to_plaintext(blob_str(&in), &out);
2646 blob_reset(&in);
2647 fossil_puts(blob_buffer(&out), 0, blob_size(&out));
2648 blob_reset(&out);
2649 }
2650 }
2651
--- src/wikiformat.c
+++ src/wikiformat.c
@@ -1975,11 +1975,11 @@
1975 blob_read_from_file(&in, g.argv[2], ExtFILE);
1976 mType = wiki_convert(&in, &out, flags);
1977 if( bText ){
1978 Blob txt;
1979 blob_init(&txt, 0, 0);
1980 html_to_plaintext(blob_str(&out),&txt, HTOT_VT100);
1981 blob_reset(&out);
1982 out = txt;
1983 }
1984 blob_write_to_file(&out, "-");
1985 if( showType ){
@@ -2034,11 +2034,11 @@
2034 safe_html_context( bSafe ? DOCSRC_UNTRUSTED : DOCSRC_TRUSTED );
2035 safe_html(&out);
2036 if( bText ){
2037 Blob txt;
2038 blob_init(&txt, 0, 0);
2039 html_to_plaintext(blob_str(&out), &txt, HTOT_VT100);
2040 blob_reset(&out);
2041 out = txt;
2042 }
2043 blob_write_to_file(&out, "-");
2044 blob_reset(&in);
@@ -2505,25 +2505,37 @@
2505 fossil_puts(blob_buffer(&out), 0, blob_size(&out));
2506 blob_reset(&out);
2507 }
2508 }
2509
2510 #if INTERFACE
2511 /*
2512 ** Allowed flag options for html_to_plaintext().
2513 */
2514 #define HTOT_VT100 0x0001 /* <mark> becomes ^[[91m */
2515 #define HTOT_NO_WS 0x0002 /* Collapse whitespace to a single space */
2516
2517 #endif /* INTERFACE */
2518
2519 /*
2520 ** Remove all HTML markup from the input text. The output written into
2521 ** pOut is pure text.
2522 **
2523 ** Put the title on the first line, if there is any <title> markup.
2524 ** If there is no <title>, then create a blank first line.
2525 */
2526 void html_to_plaintext(const char *zIn, Blob *pOut, int mFlags){
2527 int n;
2528 int i, j;
2529 int bNoWS = 0; /* Transform WS into a single space */
2530 int seenText = 0; /* True after first non-whitespace seen */
2531 int nNL = 0; /* Number of \n characters at the end of pOut */
2532 int nWS = 0; /* True if pOut ends with whitespace */
2533 int nMark = 0; /* True if inside of <mark>..</mark> */
2534
2535 while( fossil_isspace(zIn[0]) ) zIn++; /* Skip leading whitespace */
2536 if( mFlags & HTOT_NO_WS ) bNoWS = 1;
2537 while( zIn[0] ){
2538 n = html_token_length(zIn);
2539 if( zIn[0]=='<' && n>1 ){
2540 int isCloseTag;
2541 int eTag;
@@ -2543,13 +2555,30 @@
2555 if( fossil_strnicmp(zIn, "</style",7)==0 ) break;
2556 zIn += n;
2557 }
2558 if( zIn[0]=='<' ) zIn += n;
2559 continue;
2560 }
2561 if( eTag==MARKUP_INVALID && strcmp(zTag,"mark")==0 ){
2562 if( (mFlags & HTOT_VT100)!=0 ){
2563 if( isCloseTag && nMark ){
2564 blob_append(pOut, "\033[0m", 4);
2565 nMark = 0;
2566 }else if( !isCloseTag && !nMark ){
2567 blob_append(pOut, "\033[91m", 5);
2568 nMark = 1;
2569 }
2570 }
2571 zIn += n;
2572 continue;
2573 }
2574 if( eTag==MARKUP_TITLE ){
2575 if( isCloseTag && (mFlags & HTOT_NO_WS)==0 ){
2576 bNoWS = 0;
2577 }else{
2578 bNoWS = 1;
2579 }
2580 }
2581 if( !isCloseTag && seenText && (eType & (MUTYPE_BLOCK|MUTYPE_TABLE))!=0 ){
2582 if( nNL==0 ){
2583 blob_append_char(pOut, '\n');
2584 nNL++;
@@ -2557,11 +2586,11 @@
2586 nWS = 1;
2587 }
2588 }else if( fossil_isspace(zIn[0]) ){
2589 if( seenText ){
2590 nNL = 0;
2591 if( !bNoWS ){ /* '\n' -> ' ' within <title> */
2592 for(i=0; i<n; i++) if( zIn[i]=='\n' ) nNL++;
2593 }
2594 if( !nWS ){
2595 blob_append_char(pOut, nNL ? '\n' : ' ');
2596 nWS = 1;
@@ -2591,11 +2620,11 @@
2620 if( fossil_isspace(c) ){
2621 if( nWS==0 && seenText ) blob_append_char(pOut, c);
2622 nWS = 1;
2623 nNL = c=='\n';
2624 }else{
2625 if( !seenText && !bNoWS ) blob_append_char(pOut, '\n');
2626 seenText = 1;
2627 nNL = nWS = 0;
2628 if( c<0x00080 ){
2629 blob_append_char(pOut, c & 0xff);
2630 }else if( c<0x00800 ){
@@ -2611,40 +2640,48 @@
2640 blob_append_char(pOut, 0x80 + (u8)((c>>6)&0x3f));
2641 blob_append_char(pOut, 0x80 + (u8)(c&0x3f));
2642 }
2643 }
2644 }else{
2645 if( !seenText && !bNoWS ) blob_append_char(pOut, '\n');
2646 seenText = 1;
2647 nNL = nWS = 0;
2648 blob_append(pOut, zIn, n);
2649 }
2650 zIn += n;
2651 }
2652 if( nMark ) blob_append(pOut, "\033[0m", 4);
2653 if( nNL==0 ) blob_append_char(pOut, '\n');
2654 }
2655
2656 /*
2657 ** COMMAND: test-html-to-text
2658 **
2659 ** Usage: %fossil test-html-to-text [OPTIONS] FILE ...
2660 **
2661 ** Read all files named on the command-line. Convert the file
2662 ** content from HTML to text and write the results on standard
2663 ** output.
2664 **
2665 ** This command is intended as a test and debug interface for
2666 ** the html_to_plaintext() routine.
2667 **
2668 ** Options:
2669 **
2670 ** --vt100 Translate <mark> and </mark> into ANSI/VT100
2671 ** escapes to highlight the contained text.
2672 */
2673 void test_html_to_text(void){
2674 Blob in, out;
2675 int i;
2676 int mFlags = 0;
2677 if( find_option("vt100",0,0)!=0 ) mFlags |= HTOT_VT100;
2678
2679 for(i=2; i<g.argc; i++){
2680 blob_read_from_file(&in, g.argv[i], ExtFILE);
2681 blob_zero(&out);
2682 html_to_plaintext(blob_str(&in), &out, mFlags);
2683 blob_reset(&in);
2684 fossil_puts(blob_buffer(&out), 0, blob_size(&out));
2685 blob_reset(&out);
2686 }
2687 }
2688

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button