| | @@ -1975,11 +1975,11 @@ |
| 1975 | 1975 | blob_read_from_file(&in, g.argv[2], ExtFILE); |
| 1976 | 1976 | mType = wiki_convert(&in, &out, flags); |
| 1977 | 1977 | if( bText ){ |
| 1978 | 1978 | Blob txt; |
| 1979 | 1979 | blob_init(&txt, 0, 0); |
| 1980 | | - html_to_plaintext(blob_str(&out),&txt); |
| 1980 | + html_to_plaintext(blob_str(&out),&txt, HTOT_VT100); |
| 1981 | 1981 | blob_reset(&out); |
| 1982 | 1982 | out = txt; |
| 1983 | 1983 | } |
| 1984 | 1984 | blob_write_to_file(&out, "-"); |
| 1985 | 1985 | if( showType ){ |
| | @@ -2034,11 +2034,11 @@ |
| 2034 | 2034 | safe_html_context( bSafe ? DOCSRC_UNTRUSTED : DOCSRC_TRUSTED ); |
| 2035 | 2035 | safe_html(&out); |
| 2036 | 2036 | if( bText ){ |
| 2037 | 2037 | Blob txt; |
| 2038 | 2038 | blob_init(&txt, 0, 0); |
| 2039 | | - html_to_plaintext(blob_str(&out), &txt); |
| 2039 | + html_to_plaintext(blob_str(&out), &txt, HTOT_VT100); |
| 2040 | 2040 | blob_reset(&out); |
| 2041 | 2041 | out = txt; |
| 2042 | 2042 | } |
| 2043 | 2043 | blob_write_to_file(&out, "-"); |
| 2044 | 2044 | blob_reset(&in); |
| | @@ -2505,25 +2505,37 @@ |
| 2505 | 2505 | fossil_puts(blob_buffer(&out), 0, blob_size(&out)); |
| 2506 | 2506 | blob_reset(&out); |
| 2507 | 2507 | } |
| 2508 | 2508 | } |
| 2509 | 2509 | |
| 2510 | +#if INTERFACE |
| 2511 | +/* |
| 2512 | +** Flag bits for the mFlags argument of html_to_plaintext(); may be OR-ed. |
| 2513 | +*/ |
| 2514 | +#define HTOT_VT100 0x0001 /* <mark> -> ^[[91m, </mark> -> ^[[0m */ |
| 2515 | +#define HTOT_NO_WS 0x0002 /* Collapse whitespace to one space; no blank 1st line */ |
| 2516 | + |
| 2517 | +#endif /* INTERFACE */ |
| 2518 | + |
| 2510 | 2519 | /* |
| 2511 | 2520 | ** Remove all HTML markup from the input text. The output written into |
| 2512 | 2521 | ** pOut is pure text, plus VT100 escapes around <mark> text if HTOT_VT100. |
| 2513 | 2522 | ** |
| 2514 | 2523 | ** Put the title on the first line, if there is any <title> markup. |
| 2515 | 2524 | ** If there is no <title>, create a blank first line unless HTOT_NO_WS is set. |
| 2516 | 2525 | */ |
| 2517 | | -void html_to_plaintext(const char *zIn, Blob *pOut){ |
| 2526 | +void html_to_plaintext(const char *zIn, Blob *pOut, int mFlags){ |
| 2518 | 2527 | int n; |
| 2519 | 2528 | int i, j; |
| 2520 | | - int inTitle = 0; /* True between <title>...</title> */ |
| 2529 | + int bNoWS = 0; /* Transform WS into a single space */ |
| 2521 | 2530 | int seenText = 0; /* True after first non-whitespace seen */ |
| 2522 | 2531 | int nNL = 0; /* Number of \n characters at the end of pOut */ |
| 2523 | 2532 | int nWS = 0; /* True if pOut ends with whitespace */ |
| 2524 | | - while( fossil_isspace(zIn[0]) ) zIn++; |
| 2533 | + int nMark = 0; /* True if inside of <mark>..</mark> */ |
| 2534 | + |
| 2535 | + while( fossil_isspace(zIn[0]) ) zIn++; /* Skip leading whitespace */ |
| 2536 | + if( mFlags & HTOT_NO_WS ) bNoWS = 1; |
| 2525 | 2537 | while( zIn[0] ){ |
| 2526 | 2538 | n = html_token_length(zIn); |
| 2527 | 2539 | if( zIn[0]=='<' && n>1 ){ |
| 2528 | 2540 | int isCloseTag; |
| 2529 | 2541 | int eTag; |
| | @@ -2543,13 +2555,30 @@ |
| 2543 | 2555 | if( fossil_strnicmp(zIn, "</style",7)==0 ) break; |
| 2544 | 2556 | zIn += n; |
| 2545 | 2557 | } |
| 2546 | 2558 | if( zIn[0]=='<' ) zIn += n; |
| 2547 | 2559 | continue; |
| 2560 | + } |
| 2561 | + if( eTag==MARKUP_INVALID && strcmp(zTag,"mark")==0 ){ |
| 2562 | + if( (mFlags & HTOT_VT100)!=0 ){ |
| 2563 | + if( isCloseTag && nMark ){ |
| 2564 | + blob_append(pOut, "\033[0m", 4); |
| 2565 | + nMark = 0; |
| 2566 | + }else if( !isCloseTag && !nMark ){ |
| 2567 | + blob_append(pOut, "\033[91m", 5); |
| 2568 | + nMark = 1; |
| 2569 | + } |
| 2570 | + } |
| 2571 | + zIn += n; |
| 2572 | + continue; |
| 2548 | 2573 | } |
| 2549 | 2574 | if( eTag==MARKUP_TITLE ){ |
| 2550 | | - inTitle = !isCloseTag; |
| 2575 | + if( isCloseTag && (mFlags & HTOT_NO_WS)==0 ){ |
| 2576 | + bNoWS = 0; |
| 2577 | + }else{ |
| 2578 | + bNoWS = 1; |
| 2579 | + } |
| 2551 | 2580 | } |
| 2552 | 2581 | if( !isCloseTag && seenText && (eType & (MUTYPE_BLOCK|MUTYPE_TABLE))!=0 ){ |
| 2553 | 2582 | if( nNL==0 ){ |
| 2554 | 2583 | blob_append_char(pOut, '\n'); |
| 2555 | 2584 | nNL++; |
| | @@ -2557,11 +2586,11 @@ |
| 2557 | 2586 | nWS = 1; |
| 2558 | 2587 | } |
| 2559 | 2588 | }else if( fossil_isspace(zIn[0]) ){ |
| 2560 | 2589 | if( seenText ){ |
| 2561 | 2590 | nNL = 0; |
| 2562 | | - if( !inTitle ){ /* '\n' -> ' ' within <title> */ |
| 2591 | + if( !bNoWS ){ /* '\n' -> ' ' within <title> */ |
| 2563 | 2592 | for(i=0; i<n; i++) if( zIn[i]=='\n' ) nNL++; |
| 2564 | 2593 | } |
| 2565 | 2594 | if( !nWS ){ |
| 2566 | 2595 | blob_append_char(pOut, nNL ? '\n' : ' '); |
| 2567 | 2596 | nWS = 1; |
| | @@ -2591,11 +2620,11 @@ |
| 2591 | 2620 | if( fossil_isspace(c) ){ |
| 2592 | 2621 | if( nWS==0 && seenText ) blob_append_char(pOut, c); |
| 2593 | 2622 | nWS = 1; |
| 2594 | 2623 | nNL = c=='\n'; |
| 2595 | 2624 | }else{ |
| 2596 | | - if( !seenText && !inTitle ) blob_append_char(pOut, '\n'); |
| 2625 | + if( !seenText && !bNoWS ) blob_append_char(pOut, '\n'); |
| 2597 | 2626 | seenText = 1; |
| 2598 | 2627 | nNL = nWS = 0; |
| 2599 | 2628 | if( c<0x00080 ){ |
| 2600 | 2629 | blob_append_char(pOut, c & 0xff); |
| 2601 | 2630 | }else if( c<0x00800 ){ |
| | @@ -2611,40 +2640,48 @@ |
| 2611 | 2640 | blob_append_char(pOut, 0x80 + (u8)((c>>6)&0x3f)); |
| 2612 | 2641 | blob_append_char(pOut, 0x80 + (u8)(c&0x3f)); |
| 2613 | 2642 | } |
| 2614 | 2643 | } |
| 2615 | 2644 | }else{ |
| 2616 | | - if( !seenText && !inTitle ) blob_append_char(pOut, '\n'); |
| 2645 | + if( !seenText && !bNoWS ) blob_append_char(pOut, '\n'); |
| 2617 | 2646 | seenText = 1; |
| 2618 | 2647 | nNL = nWS = 0; |
| 2619 | 2648 | blob_append(pOut, zIn, n); |
| 2620 | 2649 | } |
| 2621 | 2650 | zIn += n; |
| 2622 | 2651 | } |
| 2652 | + if( nMark ) blob_append(pOut, "\033[0m", 4); |
| 2623 | 2653 | if( nNL==0 ) blob_append_char(pOut, '\n'); |
| 2624 | 2654 | } |
| 2625 | 2655 | |
| 2626 | 2656 | /* |
| 2627 | 2657 | ** COMMAND: test-html-to-text |
| 2628 | 2658 | ** |
| 2629 | | -** Usage: %fossil test-html-to-text FILE ... |
| 2659 | +** Usage: %fossil test-html-to-text [OPTIONS] FILE ... |
| 2630 | 2660 | ** |
| 2631 | 2661 | ** Read all files named on the command-line. Convert the file |
| 2632 | 2662 | ** content from HTML to text and write the results on standard |
| 2633 | 2663 | ** output. |
| 2634 | 2664 | ** |
| 2635 | 2665 | ** This command is intended as a test and debug interface for |
| 2636 | 2666 | ** the html_to_plaintext() routine. |
| 2667 | +** |
| 2668 | +** Options: |
| 2669 | +** |
| 2670 | +** --vt100 Translate <mark> and </mark> into ANSI/VT100 |
| 2671 | +** escapes to highlight the contained text. |
| 2637 | 2672 | */ |
| 2638 | 2673 | void test_html_to_text(void){ |
| 2639 | 2674 | Blob in, out; /* File content read in; converted text out */ |
| 2640 | 2675 | int i; |
| 2676 | + int mFlags = 0; /* HTOT_* bits passed to html_to_plaintext() */ |
| 2677 | + if( find_option("vt100",0,0)!=0 ) mFlags |= HTOT_VT100; |
| 2641 | 2678 | |
| 2642 | 2679 | for(i=2; i<g.argc; i++){ /* g.argv[2] onward are the input files */ |
| 2643 | 2680 | blob_read_from_file(&in, g.argv[i], ExtFILE); |
| 2644 | 2681 | blob_zero(&out); |
| 2645 | | - html_to_plaintext(blob_str(&in), &out); |
| 2682 | + html_to_plaintext(blob_str(&in), &out, mFlags); |
| 2646 | 2683 | blob_reset(&in); |
| 2647 | 2684 | fossil_puts(blob_buffer(&out), 0, blob_size(&out)); /* Output the converted text */ |
| 2648 | 2685 | blob_reset(&out); |
| 2649 | 2686 | } |
| 2650 | 2687 | } |
| 2651 | 2688 | |